Update ROCm docs core to 1.23

Fix documented VRAM for Radeon AI Pro R9700 (#5203) (#5206)
(cherry picked from commit c154b7e0a3)
2026-01-09 22:58:17 -05:00 · 2025-10-02 08:34:17 +02:00 · 2025-08-18 10:19:44 -04:00 · 2025-08-12 14:26:03 -04:00 · 2025-08-01 13:18:07 -04:00 · 2025-07-29 09:45:52 -04:00
101 changed files with 11980 additions and 3424 deletions
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -77,7 +77,8 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
-      cmakeBuildDir: 'clr/build'
+      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=amd
@@ -138,7 +139,8 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
-      cmakeBuildDir: 'clr/build'
+      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=nvidia
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -73,6 +73,7 @@ jobs:
    parameters:
      componentName: upstream-llvm
      cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
+      cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
      installDir: $(Pipeline.Workspace)/llvm
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
--- a/.azuredevops/components/aomp.yml
+++ b/.azuredevops/components/aomp.yml
@@ -118,6 +118,7 @@ jobs:
    parameters:
      componentName: extras
      cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
@@ -129,6 +130,7 @@ jobs:
    parameters:
      componentName: openmp
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
@@ -155,6 +157,7 @@ jobs:
    parameters:
      componentName: offload
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
--- a/.azuredevops/components/hipSOLVER.yml
+++ b/.azuredevops/components/hipSOLVER.yml
@@ -92,7 +92,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: external
-        cmakeBuildDir: 'deps/build'
+        cmakeBuildDir: '$(Build.SourcesDirectory)/deps/build'
+        cmakeSourceDir: '$(Build.SourcesDirectory)/deps'
        installDir: '$(Pipeline.Workspace)/deps-install'
        extraBuildFlags: >-
          -DBUILD_BOOST=OFF
--- a/.azuredevops/components/llvm-project.yml
+++ b/.azuredevops/components/llvm-project.yml
@@ -83,7 +83,8 @@ jobs:
        -DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
        -DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
        -GNinja
-      cmakeBuildDir: 'llvm/build'
+      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
      installDir: '$(Build.BinariesDirectory)/llvm'
 # use llvm-lit to run unit tests for llvm, clang, and lld
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
@@ -121,7 +122,8 @@ jobs:
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
        -DCMAKE_BUILD_TYPE=Release
-      cmakeBuildDir: 'amd/device-libs/build'
+      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: comgr
@@ -129,7 +131,8 @@ jobs:
        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
        -DCOMGR_DISABLE_SPIRV=1
        -DCMAKE_BUILD_TYPE=Release
-      cmakeBuildDir: 'amd/comgr/build'
+      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: comgr
@@ -142,7 +145,8 @@ jobs:
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
        -DHIPCC_BACKWARD_COMPATIBILITY=OFF
-      cmakeBuildDir: 'amd/hipcc/build'
+      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
+      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -105,6 +105,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
+        cmakeSourceDir: $(Build.SourcesDirectory)/grpc
        installDir: $(Build.SourcesDirectory)/bin
        extraBuildFlags: >-
          -DgRPC_INSTALL=ON
--- a/.azuredevops/components/rocAL.yml
+++ b/.azuredevops/components/rocAL.yml
@@ -125,6 +125,7 @@ jobs:
      parameters:
        componentName: PyBind11
        cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
+        cmakeSourceDir: '$(Build.SourcesDirectory)/pybind11'
        customInstallPath: false
        installEnabled: false
        extraBuildFlags: >-
@@ -141,6 +142,7 @@ jobs:
      parameters:
        componentName: RapidJSON
        cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
+        cmakeSourceDir: '$(Build.SourcesDirectory)/rapidjson'
        customInstallPath: false
        installEnabled: false
        extraBuildFlags: >-
@@ -200,7 +202,6 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm/include/rocal
    pool:
      name: ${{ job.target }}_test_pool
-      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -108,7 +108,6 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
-      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocJPEG.yml
+++ b/.azuredevops/components/rocJPEG.yml
@@ -114,7 +114,6 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
-      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -5,6 +5,12 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+- name: sparseCheckout
+  type: boolean
+  default: false
+- name: sparseCheckoutDir
+  type: string
+  default: ''
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -66,6 +72,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckout: ${{ parameters.sparseCheckout }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -168,7 +168,6 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
-      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -105,6 +105,7 @@ jobs:
          -DLAPACKE=OFF
          -GNinja
        cmakeBuildDir: '$(Build.SourcesDirectory)/lapack/build'
+        cmakeSourceDir: '$(Build.SourcesDirectory)/lapack'
        installDir: '$(Pipeline.Workspace)/deps-install'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -167,7 +167,6 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
-      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/dependencies/grpc.yml
+++ b/.azuredevops/dependencies/grpc.yml
@@ -38,6 +38,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      cmakeBuildDir: $(Agent.BuildDirectory)/grpc/build
+      cmakeSourceDir: $(Agent.BuildDirectory)/grpc
      extraBuildFlags: >-
        -DgRPC_INSTALL=ON
        -DgRPC_BUILD_TESTS=OFF
--- a/.azuredevops/dependencies/gtest.yml
+++ b/.azuredevops/dependencies/gtest.yml
@@ -38,6 +38,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
+      cmakeSourceDir: $(Agent.BuildDirectory)/googletest
      extraBuildFlags: >-
        -DGTEST_FORCE_SHARED_CRT=ON
        -DCMAKE_DEBUG_POSTFIX=d
--- a/.azuredevops/templates/steps/build-cmake.yml
+++ b/.azuredevops/templates/steps/build-cmake.yml
@@ -10,10 +10,10 @@ parameters:
  default: ''
 - name: cmakeBuildDir
  type: string
-  default: 'build'
+  default: $(Agent.BuildDirectory)/s/build
 - name: cmakeSourceDir
  type: string
-  default: '..'
+  default: $(Agent.BuildDirectory)/s
 - name: customBuildTarget
  type: string
  default: ''
@@ -46,7 +46,7 @@ steps:
    ${{ if eq(parameters.customInstallPath, true) }}:
      cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
    ${{ else }}:
-      cmakeArgs: ${{ parameters.extraBuildFlags }} ..
+      cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
 - ${{ if parameters.printDiskSpace }}:
  - script: df -h
    displayName: Disk space before build
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -4,6 +4,12 @@ parameters:
 - name: checkoutRepo
  type: string
  default: 'self'
+- name: sparseCheckout
+  type: boolean
+  default: false
+- name: sparseCheckoutDir
+  type: string
+  default: ''
 # submodule download behaviour
 # change to 'recursive' for repos with submodules
 - name: submoduleBehaviour
@@ -15,3 +21,13 @@ steps:
    clean: true
    submodules: ${{ parameters.submoduleBehaviour }}
    retryCountOnTaskFailure: 3
+    fetchFilter: blob:none
+    ${{ if eq(parameters.sparseCheckout, true) }}:
+      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
+      path: sparse
+  - ${{ if eq(parameters.sparseCheckout, true) }}:
+    - task: Bash@3
+      displayName: Symlink sparse checkout
+      inputs:
+        targetType: inline
+        script: ln -s $(Agent.BuildDirectory)/sparse/${{ parameters.sparseCheckoutDir }} $(Agent.BuildDirectory)/s
--- a/.azuredevops/templates/steps/docker-container.yml
+++ b/.azuredevops/templates/steps/docker-container.yml
@@ -106,6 +106,7 @@ parameters:
  type: object
  default:
    - gfx90a
+    - gfx942

 steps:
 # these steps should only be run if there was a failure or warning
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -6,6 +6,7 @@ ACS
 AccVGPR
 AccVGPRs
 ALU
+AllReduce
 AMD
 AMDGPU
 AMDGPUs
@@ -13,6 +14,7 @@ AMDMIGraphX
 AMI
 AOCC
 AOMP
+AOT
 AOTriton
 APBDIS
 APIC
@@ -32,8 +34,10 @@ Andrej
 Arb
 Autocast
 BARs
+BatchNorm
 BLAS
 BMC
+BabelStream
 Blit
 Blockwise
 Bluefield
@@ -78,10 +82,13 @@ ConnectX
 CuPy
 da
 Dashboarding
+Dataloading
 DBRX
 DDR
 DF
 DGEMM
+DGL
+DGLGraph
 dGPU
 dGPUs
 DIMM
@@ -99,6 +106,7 @@ DataFrame
 DataLoader
 DataParallel
 Debian
+decompositions
 DeepSeek
 DeepSpeed
 Dependabot
@@ -124,10 +132,12 @@ FX
 Filesystem
 FindDb
 Flang
+FlashAttention
 FluxBenchmark
 Fortran
 Fuyu
 GALB
+GAT
 GCC
 GCD
 GCDs
@@ -138,6 +148,7 @@ GDR
 GDS
 GEMM
 GEMMs
+GFLOPS
 GFortran
 GFXIP
 Gemma
@@ -154,6 +165,8 @@ GPT
 GPU
 GPU's
 GPUs
+Graphbolt
+GraphSage
 GRBM
 GenAI
 GenZ
@@ -166,6 +179,7 @@ HIPCC
 HIPExtension
 HIPIFY
 HIPification
+hipification
 HIPify
 HPC
 HPCG
@@ -180,6 +194,7 @@ Higgs
 Hyperparameters
 Huggingface
 ICD
+ICT
 ICV
 IDE
 IDEs
@@ -214,6 +229,7 @@ KV
 KVM
 Karpathy's
 KiB
+Kineto
 Keras
 Khronos
 LAPACK
@@ -226,6 +242,7 @@ LM
 LSAN
 LSan
 LTS
+LSTMs
 LanguageCrossEntropy
 LoRA
 MEM
@@ -262,6 +279,7 @@ Miniconda
 MirroredStrategy
 Mixtral
 MosaicML
+Mpops
 Multicore
 Multithreaded
 MyEnvironment
@@ -270,10 +288,12 @@ NBIO
 NBIOs
 NCCL
 NCF
+NFS
 NIC
 NICs
 NLI
 NLP
+NN
 NPKit
 NPS
 NSP
@@ -310,6 +330,7 @@ OpenMPI
 OpenSSL
 OpenVX
 OpenXLA
+Optim
 Oversubscription
 PagedAttention
 Pallas
@@ -348,6 +369,7 @@ RDC's
 RDMA
 RDNA
 README
+Recomputation
 RHEL
 RMW
 RNN
@@ -380,6 +402,7 @@ Ryzen
 SALU
 SBIOS
 SCA
+ScaledGEMM
 SDK
 SDMA
 SDPA
@@ -420,6 +443,8 @@ TCI
 TCIU
 TCP
 TCR
+TensorRT
+TensorFloat
 TF
 TFLOPS
 TP
@@ -498,6 +523,7 @@ ZenDNN
 accuracies
 activations
 addr
+ade
 ai
 alloc
 allocatable
@@ -505,6 +531,7 @@ allocator
 allocators
 amdgpu
 api
+aten
 atmi
 atomics
 autogenerated
@@ -513,6 +540,7 @@ avx
 awk
 backend
 backends
+bb
 benchmarked
 benchmarking
 bfloat
@@ -536,6 +564,7 @@ cd
 centos
 centric
 changelog
+checkpointing
 chiplet
 cmake
 cmd
@@ -576,6 +605,7 @@ de
 deallocation
 debuggability
 debian
+deepseek
 denoise
 denoised
 denoises
@@ -599,6 +629,7 @@ embeddings
 enablement
 encodings
 endfor
+endif
 endpgm
 enqueue
 env
@@ -641,6 +672,7 @@ hipSPARSELt
 hipTensor
 hipamd
 hipblas
+hipcc
 hipcub
 hipfft
 hipfort
@@ -670,6 +702,7 @@ installable
 interop
 interprocedural
 intra
+intrinsics
 invariants
 invocating
 ipo
@@ -688,17 +721,20 @@ linearized
 linter
 linux
 llvm
+lm
 localscratch
 logits
 lossy
 macOS
 matchers
+megatron
 microarchitecture
 migraphx
 migratable
 miopen
 miopengemm
 mivisionx
+mixtral
 mjx
 mkdir
 mlirmiopen
@@ -763,6 +799,7 @@ quantile
 quantizer
 quasirandom
 queueing
+qwen
 radeon
 rccl
 rdc
@@ -771,6 +808,7 @@ reStructuredText
 redirections
 refactorization
 reformats
+reinforcememt
 repo
 repos
 representativeness
@@ -778,6 +816,7 @@ req
 resampling
 rescaling
 reusability
+RLHF
 roadmap
 roc
 rocAL
@@ -815,6 +854,7 @@ roctracer
 rst
 runtime
 runtimes
+ResNet
 sL
 scalability
 scalable
@@ -830,6 +870,7 @@ sm
 smi
 softmax
 spack
+spmm
 src
 stochastically
 strided
@@ -838,8 +879,10 @@ subdirectory
 subexpression
 subfolder
 subfolders
+submatrix
 submodule
 submodules
+subnet
 supercomputing
 symlink
 symlinks
@@ -861,6 +904,7 @@ torchvision
 tqdm
 tracebacks
 txt
+TopK
 uarch
 uncached
 uncacheable
@@ -888,6 +932,7 @@ vectorize
 vectorized
 vectorizer
 vectorizes
+verl
 virtualize
 virtualized
 vjxb
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,141 @@ This page is a historical overview of changes made to ROCm components. This
 consolidated changelog documents key modifications and improvements across
 different versions of the ROCm software stack and its components.

+## ROCm 6.4.1
+
+See the [ROCm 6.4.1 release notes](https://rocm.docs.amd.com/en/docs-6.4.1/about/release-notes.html)
+for a complete overview of this release.
+
+### **AMD SMI** (25.4.2)
+
+#### Added
+
+* Dumping CPER entries from RAS tool `amdsmi_get_gpu_cper_entries()` to Python and C APIs.
+  - Dumped CPER entries consist of `amdsmi_cper_hdr_t`.
+  - Dumping CPER entries is also enabled in the CLI interface through `sudo amd-smi ras --cper`.
+* `amdsmi_get_gpu_busy_percent` to the C API.
+
+#### Changed
+
+* Modified VRAM display for `amd-smi monitor -v`. 
+
+#### Optimized
+
+* Improved load times for CLI commands when the GPU has multiple partitions.
+
+#### Resolved issues
+
+* Fixed partition enumeration in `amd-smi list -e`, `amdsmi_get_gpu_enumeration_info()`, `amdsmi_enumeration_info_t`, `drm_card`, and `drm_render` fields.
+
+#### Known issues
+
+* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
+
+```{note}
+See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
+```
+
+### **HIP** (6.4.1)
+
+#### Added
+
+* New log mask enumeration `LOG_COMGR` enables logging precise code object information.
+
+#### Changed
+
+* HIP runtime uses device bitcode before SPIRV.
+* The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted/disabled by default.
+* Stop using `__AMDGCN_WAVEFRONT_SIZE` and `warpSize` as compile-time constants. The `warpSize` variable is no longer `constexpr`, in order to match the CUDA specification.
+  See more details of the `warpSize` change within the [ROCm upcoming changes](#rocm-upcoming-changes).
+
+#### Optimized
+
+* Improved kernel logging includes de-mangling shader names.
+* Refined implementation in HIP APIs `hipEventRecord` and `hipStreamWaitEvent` for performance improvement.
+
+#### Resolved issues
+
+* Stale state during the graph capture. The return error was fixed; the HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
+* Segmentation fault during kernel execution. HIP runtime now allows maximum stack size as per ISA on the GPU device.
+
+### **hipBLASLt** (0.12.1)
+
+#### Resolved issues
+
+* Fixed an accuracy issue for some solutions using an `FP32` or `TF32` data type with a TT transpose.
+
+### **RCCL** (2.22.3)
+
+#### Changed
+
+* MSCCL++ is now disabled by default. To enable it, set `RCCL_MSCCLPP_ENABLE=1`.
+
+#### Resolved issues
+
+* Fixed an issue where early termination, in rare circumstances, could cause the application to stop responding by adding synchronization before destroying a proxy thread.
+* Fixed the accuracy issue for the MSCCLPP `allreduce7` kernel in graph mode.
+
+#### Known issues
+
+* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
+  This issue will be fixed in a future ROCm release.
+
+* Within the RCCL-UnitTests test suite, failures occur in tests ending with the
+  `.ManagedMem` and `.ManagedMemGraph` suffixes. These failures only affect the
+  test results and do not affect the RCCL component itself. This issue will be
+  resolved in a future ROCm release.
+
+### **rocALUTION** (3.2.3)
+
+#### Added
+
+* The `-a` option has been added to the `rmake.py` build script. This option allows you to select specific architectures when building on Microsoft Windows.
+
+#### Resolved issues
+
+* Fixed an issue where the `HIP_PATH` environment variable was being ignored when compiling on Microsoft Windows.
+
+### **ROCm Data Center Tool** (0.3.0)
+
+#### Added
+
+- Support for GPU partitions.
+- `RDC_FI_GPU_BUSY_PERCENT` metric.
+
+#### Changed
+
+- Updated `rdc_field` to align with `rdc_bootstrap` for current metrics.
+
+#### Resolved issues
+
+- Fixed [ROCProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.0/index.html) eval metrics and memory leaks.
+
+### **ROCm SMI** (7.5.0)
+
+#### Resolved issues
+
+- Fixed partition enumeration. It now refers to the correct DRM Render and Card paths.
+
+```{note}
+See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
+```
+
+### **ROCm Systems Profiler** (1.0.1)
+
+#### Added 
+
+* How-to document for [network performance profiling](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/nic-profiling.html) for standard Network Interface Cards (NICs).
+
+#### Resolved issues
+
+* Fixed a build issue with Dyninst on GCC 13.
+
+### **ROCr Runtime** (1.15.0)
+
+#### Resolved issues
+
+* Fixed a rare issue on AMD Instinct MI25, MI50, and MI100 GPUs, where the `SDMA` copies might start before the dependent kernel finishes and could cause memory corruption.
+
 ## ROCm 6.4.0

 See the [ROCm 6.4.0 release notes](https://rocm.docs.amd.com/en/docs-6.4.0/about/release-notes.html)
@@ -761,6 +896,18 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele

 - Fixed an issue where sampling multi-GPU Python workloads caused the system to stop responding.

+### **ROCm Validation Suite** (1.1.0)
+
+#### Added
+
+* Configuration files for MI210.
+* Support for OCP fp8 data type.
+* GPU index-based CLI execution.
+
+#### Changed
+
+* JSON logging with updated schema.
+
 ### **rocPRIM** (3.4.0)

 #### Added
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ The following example shows how to use the repo tool to download the ROCm source
 ```bash
 mkdir -p ~/ROCm/
 cd ~/ROCm/
-export ROCM_VERSION=6.4.0
+export ROCM_VERSION=6.4.1
 ~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
 ~/bin/repo sync
 ```
@@ -77,7 +77,7 @@ The Build time will reduce significantly if we limit the GPU Architecture/s agai

 mkdir -p ~/WORKSPACE/      # Or any folder name other than WORKSPACE
 cd ~/WORKSPACE/
-export ROCM_VERSION=6.4.0
+export ROCM_VERSION=6.4.1
 ~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
 ~/bin/repo sync

@@ -127,6 +127,7 @@ bash install-prerequisites.sh
 export GPU_ARCHS="gfx942"               # Example
 export GPU_ARCHS="gfx940;gfx941;gfx942" # Example

+cd ~/WORKSPACE/
 # Pick and run build commands in the docker container:
 # Build rocm-dev packages
 make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
--- a/RELEASE.md
+++ b/RELEASE.md
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.0"
+    <default revision="refs/tags/rocm-6.4.1"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -1,121 +1,129 @@
-ROCm Version,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
-      ,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
-      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
-      ,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
-      ,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
-      ,"Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
-,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
-,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
-      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
-      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
-      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
-      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
-      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
-      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
-      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
-      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
-      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
-,,,,,,,,,,,,,,,
-      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
-      ,,,,,,,,,,,,,,,
-      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
-      ,,,,,,,,,,,,,,,
-      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      Thrust,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      CUB,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-,,,,,,,,,,,,,,,
-      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      KMD versions,"6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
-      ,,,,,,,,,,,,,,,
-      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
-      :doc:`MIOpen <miopen:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
-      :doc:`rocAL <rocal:index>`,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
-      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`RPP <rpp:index>`,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
-      ,,,,,,,,,,,,,,,
-      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`RCCL <rccl:index>`,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-`rocSHMEM <https://github.com/ROCm/rocSHMEM>`_,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      ,,,,,,,,,,,,,,,
-      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
-      :doc:`hipBLASLt <hipblaslt:index>`,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
-      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
-      :doc:`hipfort <hipfort:index>`,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
-      :doc:`hipRAND <hiprand:index>`,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
-      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
-      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
-      :doc:`rocALUTION <rocalution:index>`,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
-      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
-      :doc:`rocRAND <rocrand:index>`,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
-      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
-      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
-      :doc:`Tensile <tensile:src/index>`,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
-      ,,,,,,,,,,,,,,,
-      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`hipCUB <hipcub:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
-      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      ,,,,,,,,,,,,,,,
-      SUPPORT LIBS,,,,,,,,,,,,,,,
-      `hipother <https://github.com/ROCm/hipother>`_,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
-      ,,,,,,,,,,,,,,,
-      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
-      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
-      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
-      ,,,,,,,,,,,,,,,
-      PERFORMANCE TOOLS,,,,,,,,,,,,,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCTracer <roctracer:index>`,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
-      ,,,,,,,,,,,,,,,
-      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,
-      :doc:`HIPIFY <hipify:index>`,19.0.0.25104,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
-      ,,,,,,,,,,,,,,,
-      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
-      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-,,,,,,,,,,,,,,,
-      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      :doc:`HIP <hip:index>`,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
+ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
+      ,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
+      ,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
+      ,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+      ,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
+      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
+      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
+      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
+      ,RDNA4,,,,,,,,,,,,,,,
+      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
+      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
+      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
+      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
+      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
+      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
+,,,,,,,,,,,,,,,,
+      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
+,,,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,
+      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
+      ,,,,,,,,,,,,,,,,
+      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      Thrust,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      CUB,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+,,,,,,,,,,,,,,,,
+      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      ,,,,,,,,,,,,,,,,
+      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
+      ,,,,,,,,,,,,,,,,
+      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      ,,,,,,,,,,,,,,,,
+      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
+      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
+      ,,,,,,,,,,,,,,,,
+      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      ,,,,,,,,,,,,,,,,
+      SUPPORT LIBS,,,,,,,,,,,,,,,,
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
+      ,,,,,,,,,,,,,,,,
+      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`AMD SMI <amdsmi:index>`,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      ,,,,,,,,,,,,,,,,
+      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCTracer <roctracer:index>`,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
+      ,,,,,,,,,,,,,,,,
+      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
+      ,,,,,,,,,,,,,,,,
+      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
+      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      ,,,,,,,,,,,,,,,,
+      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`HIP <hip:index>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -23,127 +23,133 @@ compatibility and system requirements.
 .. container:: format-big-table

  .. csv-table::
-      :header: "ROCm Version", "6.4.0", "6.3.3", "6.2.0"
+      :header: "ROCm Version", "6.4.1", "6.4.0", "6.3.0"
      :stub-columns: 1

-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4"
-      ,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3"
-      ,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9"
-      ,"SLES 15 SP6","SLES 15 SP6, SP5","SLES 15 SP6, SP5"
-      ,"Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_,Oracle Linux 8.9 [#mi300x]_
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
+      ,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10
+      ,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5"
+      ,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
      ,Debian 12 [#single-node]_,Debian 12 [#single-node]_,
      ,Azure Linux 3.0 [#mi300x]_,Azure Linux 3.0 [#mi300x]_,
      ,.. _architecture-support-compatibility-matrix:,,
      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
+      ,RDNA4,,
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,,
+      ,gfx1200 [#RDNA-OS]_,,
+      ,gfx1101 [#RDNA-OS]_,,
+      ,gfx1100,gfx1100,gfx1100
      ,gfx1030,gfx1030,gfx1030
-      ,gfx942,gfx942,gfx942 [#mi300_620]_
+      ,gfx942,gfx942,gfx942
      ,gfx90a,gfx90a,gfx90a
      ,gfx908,gfx908,gfx908
      ,,,
      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.3, 2.2, 2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.31,0.4.26
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.20,1.17.3,1.17.3
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31  
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`,N/A,2.4.0,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.20,1.20,1.17.3
      ,,,
      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0
      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0
      ,,,
      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
-      Thrust,2.5.0,2.3.2,2.2.0
-      CUB,2.5.0,2.3.2,2.2.0
+      Thrust,2.5.0,2.5.0,2.3.2
+      CUB,2.5.0,2.5.0,2.3.2
      ,,,
      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      KMD versions,"6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x"
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.11.0,2.10.0
-      :doc:`MIOpen <miopen:index>`,3.4.0,3.3.0,3.2.0
-      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.1.0,3.0.0
-      :doc:`rocAL <rocal:index>`,2.2.0,2.1.0,1.0.0
-      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.8.0,0.6.0
-      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.6.0,N/A
-      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.2.0,0.1.0
-      :doc:`RPP <rpp:index>`,1.9.10,1.9.1,1.8.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
-      :doc:`RCCL <rccl:index>`,2.22.3,2.21.5,2.20.5
-      `rocSHMEM <https://github.com/ROCm/rocSHMEM>`_ ,2.0.0,N/A,N/A
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.3.0,2.2.0
-      :doc:`hipBLASLt <hipblaslt:index>`,0.12.0,0.10.0,0.8.0
-      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.17,1.0.14
-      :doc:`hipfort <hipfort:index>`,0.6.0,0.5.1,0.4.0
-      :doc:`hipRAND <hiprand:index>`,2.12.0,2.11.1,2.11.0
-      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.3.0,2.2.0
-      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.1.2,3.1.1
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.2,0.2.1
-      :doc:`rocALUTION <rocalution:index>`,3.2.2,3.2.1,3.2.0
-      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.3.0,4.2.0
-      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.31,1.0.28
-      :doc:`rocRAND <rocrand:index>`,3.3.0,3.2.0,3.1.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.27.0,3.26.0
-      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.3.0,3.2.0
-      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.6.0,1.5.0
-      :doc:`Tensile <tensile:src/index>`,4.43.0,4.42.0,4.41.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.0,0.10.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.0
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.2,3.2.1
+      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.4.0,4.3.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.28.0,3.27.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
-      :doc:`hipCUB <hipcub:index>`,3.4.0,3.3.0,3.2.0
-      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.4.0,1.3.0
-      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.3.0,3.2.0
-      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.0.1
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0
      ,,,
      SUPPORT LIBS,,,
-      `hipother <https://github.com/ROCm/hipother>`_,6.4.43482,6.3.42134,6.2.41133
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.0,6.3.3,6.2.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,20240607.1.4246
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43482,6.3.42131
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.1,6.4.0,6.3.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,25.3.0,24.7.1,24.6.2
+      :doc:`AMD SMI <amdsmi:index>`,25.4.2,25.3.0,24.7.1
      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.4.0,7.3.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.0.60200
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.4.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.0.0,2.0.1
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.0,0.1.2,1.11.2
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60400,2.0.60303,2.0.60200
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.5.0,0.4.0
-      :doc:`ROCTracer <roctracer:index>`,4.1.60400,4.1.60303,4.1.60200
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.1.0,3.0.0
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.1,1.0.0,0.1.0
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60401,2.0.60400,2.0.60300
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0
+      :doc:`ROCTracer <roctracer:index>`,4.1.60401,4.1.60400,4.1.60300
      ,,,
      DEVELOPMENT TOOLS,,,
-      :doc:`HIPIFY <hipify:index>`,19.0.0.25133,18.0.0.25012,18.0.0.24232
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.13.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.0,0.76.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,14.2.0
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.24455
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0
      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.3,2.0.3
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3
      ,,,
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25133,18.0.0.25012,18.0.0.24232
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25133,18.0.0.25012,18.0.0.24232
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25133,18.0.0.25012,18.0.0.24232
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.24455
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.24491
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43482,6.3.42134,6.2.41133
-      :doc:`HIP <hip:index>`,6.4.43482,6.3.42134,6.2.41133
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42131
+      :doc:`HIP <hip:index>`,6.4.43483,6.4.43482,6.3.42131
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.14.0,1.13.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0


 .. rubric:: Footnotes
@@ -151,8 +157,9 @@ compatibility and system requirements.
 .. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
 .. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality. 
 .. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
-.. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
+.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided for releases up to a year apart. For earlier ROCm releases, compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate at the time of the initial ROCm release. For the most up-to-date information, see the latest `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
+.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.

 .. _OS-kernel-versions:

@@ -170,7 +177,8 @@ Use this lookup table to confirm which operating system and kernel versions are
   ,,
   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.5, 5.14+, 2.34
+   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
+   , 9.5, 5.14+, 2.34
   ,9.4, 5.14+, 2.34
   ,9.3, 5.14+, 2.34
   ,,
@@ -229,5 +237,9 @@ Expand for full historical view of:
   .. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
-   .. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
+   .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+   .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
+   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided for releases up to a year apart. For earlier ROCm releases, compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate at the time of the initial ROCm release. For the most up-to-date information, see the latest `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
+   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
--- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst
@@ -0,0 +1,255 @@
+:orphan:
+
+.. meta::
+    :description: Deep Graph Library (DGL) compatibility
+    :keywords: GPU, DGL compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+DGL compatibility
+********************************************************************************
+
+Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance, and scalable
+Python package for deep learning on graphs. DGL is framework agnostic: if a deep
+graph model is one component of an end-to-end application, the rest of the logic
+can be implemented in a major deep learning framework such as PyTorch.
+
+* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository. 
+* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository. 
+* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
+* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>` 
+  to install and get started.
+
+
+Supported devices
+================================================================================
+
+- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipBLASLt)
+- **Partially Supported**: TF32 with AMD Instinct MI250X
+
+
+.. _dgl-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+DGL can be used for graph learning and for building popular graph models such as
+GAT, GCN, and GraphSAGE. These models support a variety of use cases, such as:
+
+- Recommender systems
+- Network Optimization and Analysis
+- 1D (Temporal) and 2D (Image) Classification
+- Drug Discovery
+
+Multiple use cases of DGL have been tested and verified.
+The recommended example is a drug discovery pipeline that uses the ``SE3Transformer``.
+Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_, 
+where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs. 
+
+Coverage includes:
+
+- Single-GPU training/inference
+- Multi-GPU training
+
+
+.. _dgl-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
+with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
+Click the |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: DGL Docker image components
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker
+      - DGL
+      - PyTorch
+      - Ubuntu
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
+      - 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      
+
+Key ROCm libraries for DGL
+================================================================================
+
+DGL on ROCm depends on specific libraries that affect its features and performance.
+Using the DGL Docker container or building it with the provided Dockerfile or a ROCm base image is recommended.
+If you prefer to build it yourself, ensure the following dependencies are installed:
+
+.. list-table:: 
+    :header-rows: 1
+
+    * - ROCm library
+      - Version
+      - Purpose
+    * - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
+      - :version-ref:`"Composable Kernel" rocm_version`
+      - Enables faster execution of core operations like matrix multiplication
+        (GEMM), convolutions and transformations.
+    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
+      - :version-ref:`hipBLAS rocm_version`
+      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
+        matrix and vector operations.
+    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
+      - :version-ref:`hipBLASLt rocm_version`
+      - hipBLASLt is an extension of the hipBLAS library, providing additional
+        features like epilogues fused into the matrix multiplication kernel or
+        use of integer tensor cores.
+    * - `hipCUB <https://github.com/ROCm/hipCUB>`_
+      - :version-ref:`hipCUB rocm_version`
+      - Provides a C++ template library for parallel algorithms for reduction,
+        scan, sort and select.
+    * - `hipFFT <https://github.com/ROCm/hipFFT>`_
+      - :version-ref:`hipFFT rocm_version`
+      - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
+    * - `hipRAND <https://github.com/ROCm/hipRAND>`_
+      - :version-ref:`hipRAND rocm_version`
+      - Provides fast random number generation for GPUs.
+    * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
+      - :version-ref:`hipSOLVER rocm_version`
+      - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
+        singular value decompositions (SVD).
+    * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
+      - :version-ref:`hipSPARSE rocm_version`
+      - Accelerates operations on sparse matrices, such as sparse matrix-vector
+        or matrix-matrix products.
+    * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
+      - :version-ref:`hipSPARSELt rocm_version`
+      - Accelerates operations on sparse matrices, such as sparse matrix-vector
+        or matrix-matrix products.
+    * - `hipTensor <https://github.com/ROCm/hipTensor>`_
+      - :version-ref:`hipTensor rocm_version`
+      - Optimizes for high-performance tensor operations, such as contractions.
+    * - `MIOpen <https://github.com/ROCm/MIOpen>`_
+      - :version-ref:`MIOpen rocm_version`
+      - Optimizes deep learning primitives such as convolutions, pooling,
+        normalization, and activation functions.
+    * - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
+      - :version-ref:`MIGraphX rocm_version`
+      - Adds graph-level optimizations, support for ONNX models and mixed precision,
+        and enables Ahead-of-Time (AOT) compilation.
+    * - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
+      - :version-ref:`MIVisionX rocm_version`
+      - Optimizes acceleration for computer vision and AI workloads like
+        preprocessing, augmentation, and inferencing.
+    * - `rocAL <https://github.com/ROCm/rocAL>`_
+      - :version-ref:`rocAL rocm_version`
+      - Accelerates the data pipeline by offloading intensive preprocessing and
+        augmentation tasks. rocAL is part of MIVisionX.
+    * - `RCCL <https://github.com/ROCm/rccl>`_
+      - :version-ref:`RCCL rocm_version`
+      - Optimizes for multi-GPU communication for operations like AllReduce and
+        Broadcast.
+    * - `rocDecode <https://github.com/ROCm/rocDecode>`_
+      - :version-ref:`rocDecode rocm_version`
+      - Provides hardware-accelerated data decoding capabilities, particularly
+        for image, video, and other dataset formats.
+    * - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
+      - :version-ref:`rocJPEG rocm_version`
+      - Provides hardware-accelerated JPEG image decoding and encoding.
+    * - `RPP <https://github.com/ROCm/RPP>`_
+      - :version-ref:`RPP rocm_version`
+      - Speeds up data augmentation, transformation, and other preprocessing steps.
+    * - `rocThrust <https://github.com/ROCm/rocThrust>`_
+      - :version-ref:`rocThrust rocm_version`
+      - Provides a C++ template library for parallel algorithms like sorting,
+        reduction, and scanning.
+    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
+      - :version-ref:`rocWMMA rocm_version`
+      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
+        multiplication (GEMM) and accumulation operations with mixed precision
+        support.
+
+
+Supported features
+================================================================================
+
+Many functions and methods available in upstream DGL are also supported in DGL on ROCm.
+Instead of listing them all, support is grouped into the following categories to provide a general overview.
+
+* DGL Base
+* DGL Backend 
+* DGL Data
+* DGL Dataloading
+* DGL DGLGraph
+* DGL Function
+* DGL Ops
+* DGL Sampling
+* DGL Transforms
+* DGL Utils
+* DGL Distributed
+* DGL Geometry
+* DGL Mpops
+* DGL NN
+* DGL Optim
+* DGL Sparse
+
+
+Unsupported features
+================================================================================
+
+* Graphbolt
+* Partial TF32 Support (MI250X only)
+* Kineto/ROCTracer integration
+
+
+Unsupported functions
+================================================================================
+
+* ``more_nnz``
+* ``format``
+* ``multiprocess_sparse_adam_state_dict``
+* ``record_stream_ndarray``
+* ``half_spmm``
+* ``segment_mm`` 
+* ``gather_mm_idx_b``
+* ``pgexplainer``
+* ``sample_labors_prob``
+* ``sample_labors_noprob``
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -53,7 +53,7 @@ Use cases and recommendations
 * The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
  blog explores the implementation and training of a Generative Pre-trained
  Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s JAX-based
-  nanoGPT. Comparing how essential GPT components—such as self-attention 
+  nanoGPT. Comparing how essential GPT components—such as self-attention
  mechanisms and optimizers—are realized in PyTorch and JAX also highlights
  JAX’s unique features.

@@ -97,7 +97,7 @@ Docker image compatibility
 AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
 with ROCm backends on Docker Hub. The following Docker image tags and
 associated inventories represent the latest JAX version from the official Docker Hub and are validated for
-`ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click the |docker-icon|
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
 icon to view the image on Docker Hub.

 .. list-table:: JAX Docker image components
@@ -110,19 +110,19 @@ icon to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4-jax0.4.35-py3.12/images/sha256-4069398229078f3311128b6d276c6af377c7e97d3363d020b0bf7154fae619ca"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.12/images/sha256-7a0745a2a2758bdf86397750bac00e9086cbf67d170cfdbb08af73f7c7d18a6a"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>

      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
      - Ubuntu 24.04
-      - `3.12.7 <https://www.python.org/downloads/release/python-3127/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4-jax0.4.35-py3.10/images/sha256-a137f901f91ce6c13b424c40a6cf535248d4d20fd36d5daf5eee0570190a4a11"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.10/images/sha256-5f9e8d6e6e69fdc9a1a3f2ba3b1234c3f46c53b7468538c07fd18b00899da54f"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>

      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
      - Ubuntu 22.04
-      - `3.10.14 <https://www.python.org/downloads/release/python-31014/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_

 AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
 with ROCm backends on Docker Hub. The following Docker image tags and
@@ -160,12 +160,14 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
      - Ubuntu 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_

+.. _key_rocm_libraries:
+
 Key ROCm libraries for JAX
 ================================================================================

-JAX functionality on ROCm is determined by its underlying library
-dependencies. These ROCm components affect the capabilities, performance, and
-feature set available to developers.
+The following ROCm libraries represent potential targets that could be utilized
+by JAX on ROCm for various computational tasks. The actual libraries used will
+depend on the specific implementation and operations performed.

 .. list-table::
    :header-rows: 1
@@ -173,345 +175,140 @@ feature set available to developers.
    * - ROCm library
      - Version
      - Purpose
-      - Used in
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
      - :version-ref:`hipBLAS rocm_version`
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
-      - Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot`` and
-        ``jax.lax.dot_general``, operations like ``jax.numpy.dot``, which
-        involve vector and matrix computations and batch matrix multiplications
-        ``jax.numpy.einsum`` with matrix-multiplication patterns algebra
-        operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
      - :version-ref:`hipBLASLt rocm_version`
      - hipBLASLt is an extension of hipBLAS, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
-      - Matrix multiplication in ``jax.numpy.matmul`` or ``jax.lax.dot``, and
-        the XLA (Accelerated Linear Algebra) use hipBLASLt for optimized matrix
-        operations, mixed-precision support, and hardware-specific
-        optimizations.
    * - `hipCUB <https://github.com/ROCm/hipCUB>`_
      - :version-ref:`hipCUB rocm_version`
      - Provides a C++ template library for parallel algorithms for reduction,
        scan, sort and select.
-      - Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
-        ``jax.numpy.prod``, ``jax.numpy.max`` and ``jax.numpy.min``), prefix sum
-        (``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
-        (``jax.numpy.sort``, ``jax.numpy.argsort``).
    * - `hipFFT <https://github.com/ROCm/hipFFT>`_
      - :version-ref:`hipFFT rocm_version`
      - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
-      - Used in functions like ``jax.numpy.fft``.
    * - `hipRAND <https://github.com/ROCm/hipRAND>`_
      - :version-ref:`hipRAND rocm_version`
      - Provides fast random number generation for GPUs.
-      - The ``jax.random.uniform``, ``jax.random.normal``,
-        ``jax.random.randint`` and ``jax.random.split``.
    * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
      - :version-ref:`hipSOLVER rocm_version`
      - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
        singular value decompositions (SVD).
-      - Solving linear systems (``jax.numpy.linalg.solve``), matrix
-        factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
-        (``jax.numpy.linalg.eig``).
    * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
      - :version-ref:`hipSPARSE rocm_version`
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
-      - Sparse matrix multiplication (``jax.numpy.matmul``), sparse
-        matrix-vector and matrix-matrix products
-        (``jax.experimental.sparse.dot``), sparse linear system solvers and
-        sparse data handling.
    * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
      - :version-ref:`hipSPARSELt rocm_version`
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
-      - Sparse matrix multiplication (``jax.numpy.matmul``), sparse
-        matrix-vector and matrix-matrix products
-        (``jax.experimental.sparse.dot``) and sparse linear system solvers.
    * - `MIOpen <https://github.com/ROCm/MIOpen>`_
      - :version-ref:`MIOpen rocm_version`
      - Optimized for deep learning primitives such as convolutions, pooling,
        normalization, and activation functions.
-      - Speeds up convolutional neural networks (CNNs), recurrent neural
-        networks (RNNs), and other layers. Used in operations like
-        ``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
    * - `RCCL <https://github.com/ROCm/rccl>`_
      - :version-ref:`RCCL rocm_version`
      - Optimized for multi-GPU communication for operations like  all-reduce,
        broadcast, and scatter.
-      - Distribute computations across multiple GPU with ``pmap`` and
-        ``jax.distributed``. XLA automatically uses rccl when executing
-        operations across multiple GPUs on AMD hardware.
    * - `rocThrust <https://github.com/ROCm/rocThrust>`_
      - :version-ref:`rocThrust rocm_version`
      - Provides a C++ template library for parallel algorithms like sorting,
        reduction, and scanning.
-      - Reduction operations like ``jax.numpy.sum``, ``jax.pmap`` for
-        distributed training, which involves parallel reductions or
-        operations like ``jax.numpy.cumsum`` can use rocThrust.

-Supported features
+.. note::
+
+    This table shows ROCm libraries that could potentially be utilized by JAX. Not
+    all libraries may be used in every configuration, and the actual library usage
+    will depend on the specific operations and implementation details.
+
+Supported data types and modules
 ===============================================================================

-The following table maps the public JAX API modules to their supported
-ROCm and JAX versions.
+The following tables list the supported public JAX API data types and modules.

-.. list-table::
-    :header-rows: 1
-
-    * - Module
-      - Description
-      - As of JAX
-      - As of ROCm
-    * - ``jax.numpy``
-      - Implements the NumPy API, using the primitives in ``jax.lax``.
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.scipy``
-      - Provides GPU-accelerated and differentiable implementations of many
-        functions from the SciPy library, leveraging JAX's transformations
-        (e.g., ``grad``, ``jit``, ``vmap``).
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.lax``
-      - A library of primitives operations that underpins libraries such as
-        ``jax.numpy.`` Transformation rules, such as Jacobian-vector product
-        (JVP) and batching rules, are typically defined as transformations on
-        ``jax.lax`` primitives.
-      - 0.1.57
-      - 5.0.0
-    * - ``jax.random``
-      - Provides a number of routines for deterministic generation of sequences
-        of pseudorandom numbers.
-      - 0.1.58
-      - 5.0.0
-    * - ``jax.sharding``
-      - Allows to define partitioning and distributing arrays across multiple
-        devices.
-      - 0.3.20
-      - 5.1.0
-    * - ``jax.distributed``
-      - Enables the scaling of computations across multiple devices on a single
-        machine or across multiple machines.
-      - 0.1.74
-      - 5.0.0
-    * - ``jax.image``
-      - Contains image manipulation functions like resize, scale and translation.
-      - 0.1.57
-      - 5.0.0
-    * - ``jax.nn``
-      - Contains common functions for neural network libraries.
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.ops``
-      - Computes the minimum, maximum, sum or product within segments of an
-        array.
-      - 0.1.57
-      - 5.0.0
-    * - ``jax.stages``
-      - Contains interfaces to stages of the compiled execution process.
-      - 0.3.4
-      - 5.0.0
-    * - ``jax.extend``
-      - Provides modules for access to JAX internal machinery module. The
-        ``jax.extend`` module defines a library view of some of JAX’s internal
-        components.
-      - 0.4.15
-      - 5.5.0
-    * - ``jax.example_libraries``
-      - Serves as a collection of example code and libraries that demonstrate
-        various capabilities of JAX.
-      - 0.1.74
-      - 5.0.0
-    * - ``jax.experimental``
-      - Namespace for experimental features and APIs that are in development or
-        are not yet fully stable for production use.
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.lib``
-      - Set of internal tools and types for bridging between JAX’s Python
-        frontend and its XLA backend.
-      - 0.4.6
-      - 5.3.0
-    * - ``jax_triton``
-      - Library that integrates the Triton deep learning compiler with JAX.
-      - jax_triton 0.2.0
-      - 6.2.4
-
-jax.scipy module
-------------------------------------------------------------------------------
-
-A SciPy-like API for scientific computing.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Module
-      - As of JAX
-      - As of ROCm
-    * - ``jax.scipy.cluster``
-      - 0.3.11
-      - 5.1.0
-    * - ``jax.scipy.fft``
-      - 0.1.71
-      - 5.0.0
-    * - ``jax.scipy.integrate``
-      - 0.4.15
-      - 5.5.0
-    * - ``jax.scipy.interpolate``
-      - 0.1.76
-      - 5.0.0
-    * - ``jax.scipy.linalg``
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.scipy.ndimage``
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.scipy.optimize``
-      - 0.1.57
-      - 5.0.0
-    * - ``jax.scipy.signal``
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.scipy.spatial.transform``
-      - 0.4.12
-      - 5.4.0
-    * - ``jax.scipy.sparse.linalg``
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.scipy.special``
-      - 0.1.56
-      - 5.0.0
-    * - ``jax.scipy.stats``
-      - 0.1.56
-      - 5.0.0
-
-jax.scipy.stats module
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. list-table::
-   :header-rows: 1
-
-   * - Module
-     - As of JAX
-     - As of ROCm
-   * - ``jax.scipy.stats.bernouli``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.beta``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.betabinom``
-     - 0.1.61
-     - 5.0.0
-   * - ``jax.scipy.stats.binom``
-     - 0.4.14
-     - 5.4.0
-   * - ``jax.scipy.stats.cauchy``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.chi2``
-     - 0.1.61
-     - 5.0.0
-   * - ``jax.scipy.stats.dirichlet``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.expon``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.gamma``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.gennorm``
-     - 0.3.15
-     - 5.2.0
-   * - ``jax.scipy.stats.geom``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.laplace``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.logistic``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.multinomial``
-     - 0.3.18
-     - 5.1.0
-   * - ``jax.scipy.stats.multivariate_normal``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.nbinom``
-     - 0.1.72
-     - 5.0.0
-   * - ``jax.scipy.stats.norm``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.pareto``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.poisson``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.t``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.truncnorm``
-     - 0.4.0
-     - 5.3.0
-   * - ``jax.scipy.stats.uniform``
-     - 0.1.56
-     - 5.0.0
-   * - ``jax.scipy.stats.vonmises``
-     - 0.4.2
-     - 5.3.0
-   * - ``jax.scipy.stats.wrapcauchy``
-     - 0.4.20
-     - 5.6.0
-
-jax.extend module
-------------------------------------------------------------------------------
-
-Modules for JAX extensions.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Module
-      - As of JAX
-      - As of ROCm
-    * - ``jax.extend.ffi``
-      - 0.4.30
-      - 6.0.0
-    * - ``jax.extend.linear_util``
-      - 0.4.17
-      - 5.6.0
-    * - ``jax.extend.mlir``
-      - 0.4.26
-      - 5.6.0
-    * - ``jax.extend.random``
-      - 0.4.15
-      - 5.5.0
-
-Unsupported JAX features
+Supported data types
 --------------------------------------------------------------------------------

-The following GPU-accelerated JAX features are not supported by ROCm for
-the listed supported JAX versions.
+ROCm supports all the JAX data types of the `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
+module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_,
+and `default_dtype <https://docs.jax.dev/en/latest/default_dtypes.html>`_.
+The JAX data types supported on ROCm are listed in the following table.

 .. list-table::
    :header-rows: 1

-    * - Feature
+    * - Data type
      - Description
-    * - Mixed Precision with TF32
-      - Mixed precision with TF32 is used for matrix multiplications,
-        convolutions, and other linear algebra operations, particularly in
-        deep learning workloads like CNNs and transformers.
-    * - XLA int4 support
-      - 4-bit integer (int4) precision in the XLA compiler.
-    * - MOSAIC (GPU)
-      - Mosaic is a library of kernel-building abstractions for JAX's Pallas system
-      - Not Supported
+
+    * - ``bfloat16``
+      - 16-bit bfloat (brain floating point).
+
+    * - ``bool``
+      - Boolean.
+
+    * - ``complex128``
+      - 128-bit complex.
+
+    * - ``complex64``
+      - 64-bit complex.
+
+    * - ``float16``
+      - 16-bit (half precision) floating-point.
+
+    * - ``float32``
+      - 32-bit (single precision) floating-point.
+
+    * - ``float64``
+      - 64-bit (double precision) floating-point.
+
+    * - ``half``
+      - 16-bit (half precision) floating-point.
+
+    * - ``int16``
+      - Signed 16-bit integer.
+
+    * - ``int32``
+      - Signed 32-bit integer.
+
+    * - ``int64``
+      - Signed 64-bit integer.
+
+    * - ``int8``
+      - Signed 8-bit integer.
+
+    * - ``uint16``
+      - Unsigned 16-bit (word) integer.
+
+    * - ``uint32``
+      - Unsigned 32-bit (dword) integer.
+
+    * - ``uint64``
+      - Unsigned 64-bit (qword) integer.
+
+    * - ``uint8``
+      - Unsigned 8-bit (byte) integer.
+
+.. note::
+
+  JAX data type support is affected by the :ref:`key_rocm_libraries` and is
+  summarized on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
+  page.
+
+Supported modules
+--------------------------------------------------------------------------------
+
+For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
+``jax.scipy``, ``jax.lax``), their descriptions, and usage, please refer directly to the
+`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.
+
+.. note::
+
+  Since version 0.1.56, JAX has full support for ROCm, and the
+  :ref:`Known issues and important notes <jax_comp_known_issues>` section
+  contains details about limitations specific to the ROCm backend. The list of
+  JAX API modules is maintained by the JAX project and is subject to change.
+  Refer to the official JAX documentation for the most up-to-date information.
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -95,7 +95,7 @@ Docker image compatibility

 AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
 with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
+inventories were tested on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_.
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: PyTorch Docker image components
@@ -116,137 +116,122 @@ Click |docker-icon| to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-ab1d350b818b90123cfda31363019d11c0d41a8f12a19e3cb2cb40cf0261137d"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-c76af9bfb1c25b0f40d4c29e8652105c57250bf018d23ff595b06bd79666fdd7"><i class="fab fa-docker fa-lg"></i></a>

      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `1.16.0 <https://github.com/openucx/ucx/tree/v1.16.0>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-130536fdfceb374626a7bcb8d00b9d796ddfc3115677d51229e5b852d96b5ef4"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-f9d226135d51831c810dcb1251636ec61f85c65fcdda03e188c053a5d4f6585b"><i class="fab fa-docker fa-lg"></i></a>

      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-20a2e24b4738dc1f1a44a04f23827918b56c99f7e697e6fccb90e9c4fae8ca9b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-3490e74d4f43dcdb3351dd334108d1ccd47e5a687c0523a2424ac1bcdd3dd6dd"><i class="fab fa-docker fa-lg"></i></a>

      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-f09cb8ca39cc39222fb554060711f5c19130f7b4047aaf41fad4ba3ec470ca03"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-26c5dfffb4a54625884abca83166940f17dd27bc75f1b24f6e80fbcb7d4e9afb"><i class="fab fa-docker fa-lg"></i></a>

      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
      - 22.04
-      - `3.11.9 <https://www.python.org/downloads/release/python-3119/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
-      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-a91c100d1fe608dae3eb7f60a751630363d4027ac3d077d428e92945204c338e"><i class="fab fa-docker fa-lg"></i></a>
-
-      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
-      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
-      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
-      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
-      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
-      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
-      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-66a89ce6485bb887af74bb9bd76bb613ab9834a6b1374649ea7ae379883454a4"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-f378a24561fa6efc178b6dc93fc7d82e5b93653ecd59c89d4476674d29e1284d"><i class="fab fa-docker fa-lg"></i></a>

      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-c716cf167e6e49893f11de03606ed37044153aca089e74ca615065c06877f86b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-2308dbd0e650b7bf8d548575cbb6e2bdc021f9386384ce570da16d58ee684d22"><i class="fab fa-docker fa-lg"></i></a>

      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
-      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-0434cbc9b07b2c26e39480d7447f676f9057a1054dcff00e0050c25a6eddbd3c"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-eefd2ab019728f91f94c5e6a9463cb0ea900b3011458d18fe5d88e50c0b57d86"><i class="fab fa-docker fa-lg"></i></a>

      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-688b1c0073092615fb98778d78b16191e506097ee116a2d3d2628b264d5d367b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-473643226ab0e93a04720b256ed772619878abf9c42b9f84828cefed522696fd"><i class="fab fa-docker fa-lg"></i></a>

      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

 Key ROCm libraries for PyTorch
@@ -387,24 +372,15 @@ feature set available to developers.
        involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
        more.

-Supported features
+Supported modules and data types
 ================================================================================

-This section maps GPU-accelerated PyTorch features to their supported ROCm and
-PyTorch versions.
+The following section outlines the supported data types, modules, and domain libraries available in PyTorch on ROCm.

-torch
+Supported data types
 --------------------------------------------------------------------------------

-`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
-PyTorch, providing data structures for multi-dimensional tensors and
-implementing mathematical operations on them. It also includes utilities for
-efficient serialization of tensors and arbitrary data types and other tools.
-
-Tensor data types
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The tensor data type is specified using the ``dtype`` attribute or argument. 
+The tensor data type is specified using the ``dtype`` attribute or argument.
 PyTorch supports many data types for different use cases.

 The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_
@@ -415,539 +391,154 @@ single data types:

    * - Data type
      - Description
-      - As of PyTorch
-      - As of ROCm
    * - ``torch.float8_e4m3fn``
      - 8-bit floating point, e4m3
-      - 2.3
-      - 5.5
    * - ``torch.float8_e5m2``
      - 8-bit floating point, e5m2
-      - 2.3
-      - 5.5
    * - ``torch.float16`` or ``torch.half``
      - 16-bit floating point
-      - 0.1.6
-      - 2.0
    * - ``torch.bfloat16``
      - 16-bit floating point
-      - 1.6
-      - 2.6
    * - ``torch.float32`` or ``torch.float``
      - 32-bit floating point
-      - 0.1.12_2
-      - 2.0
    * - ``torch.float64`` or ``torch.double``
      - 64-bit floating point
-      - 0.1.12_2
-      - 2.0
    * - ``torch.complex32`` or ``torch.chalf``
-      - PyTorch provides native support for 32-bit complex numbers
-      - 1.6
-      - 2.0
+      - 32-bit complex numbers
    * - ``torch.complex64`` or ``torch.cfloat``
-      - PyTorch provides native support for 64-bit complex numbers
-      - 1.6
-      - 2.0
+      - 64-bit complex numbers
    * - ``torch.complex128`` or ``torch.cdouble``
-      - PyTorch provides native support for 128-bit complex numbers
-      - 1.6
-      - 2.0
+      - 128-bit complex numbers
    * - ``torch.uint8``
      - 8-bit integer (unsigned)
-      - 0.1.12_2
-      - 2.0
    * - ``torch.uint16``
-      - 16-bit integer (unsigned)
-      - 2.3
-      - Not natively supported
+      - 16-bit integer (unsigned);
+        Not natively supported in ROCm
    * - ``torch.uint32``
-      - 32-bit integer (unsigned)
-      - 2.3
-      - Not natively supported
+      - 32-bit integer (unsigned);
+        Not natively supported in ROCm
    * - ``torch.uint64``
-      - 32-bit integer (unsigned)
-      - 2.3
-      - Not natively supported
+      - 64-bit integer (unsigned);
+        Not natively supported in ROCm
    * - ``torch.int8``
      - 8-bit integer (signed)
-      - 1.12
-      - 5.0
    * - ``torch.int16`` or ``torch.short``
      - 16-bit integer (signed)
-      - 0.1.12_2
-      - 2.0
    * - ``torch.int32`` or ``torch.int``
      - 32-bit integer (signed)
-      - 0.1.12_2
-      - 2.0
    * - ``torch.int64`` or ``torch.long``
      - 64-bit integer (signed)
-      - 0.1.12_2
-      - 2.0
    * - ``torch.bool``
      - Boolean
-      - 1.2
-      - 2.0
    * - ``torch.quint8``
      - Quantized 8-bit integer (unsigned)
-      - 1.8
-      - 5.0
    * - ``torch.qint8``
      - Quantized 8-bit integer (signed)
-      - 1.8
-      - 5.0
    * - ``torch.qint32``
      - Quantized 32-bit integer (signed)
-      - 1.8
-      - 5.0
    * - ``torch.quint4x2``
      - Quantized 4-bit integer (unsigned)
-      - 1.8
-      - 5.0

 .. note::

-  Unsigned types except ``uint8`` have limited support in eager mode. They
+  Unsigned types, except ``uint8``, have limited support in eager mode. They
  primarily exist to assist usage with ``torch.compile``.

  See :doc:`ROCm precision support <rocm:reference/precision-support>` for the
  native hardware support of data types.

-torch.cuda
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``torch.cuda`` in PyTorch is a module that provides utilities and functions for
-managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
-computations, memory management, and efficient execution of tensor operations,
-leveraging ROCm and CUDA as the underlying frameworks.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Feature
-      - Description
-      - As of PyTorch
-      - As of ROCm
-    * - Device management
-      - Utilities for managing and interacting with GPUs.
-      - 0.4.0
-      - 3.8
-    * - Tensor operations on GPU
-      - Performs tensor operations such as addition and matrix multiplications on
-        the GPU.
-      - 0.4.0
-      - 3.8
-    * - Streams and events
-      - Streams allow overlapping computation and communication for optimized
-        performance. Events enable synchronization.
-      - 1.6.0
-      - 3.8
-    * - Memory management
-      - Functions to manage and inspect memory usage like
-        ``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
-        ``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
-      - 0.3.0
-      - 1.9.2
-    * - Running process lists of memory management
-      - Returns a human-readable printout of the running processes and their GPU
-        memory use for a given device with functions like
-        ``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
-      - 1.8.0
-      - 4.0
-    * - Communication collectives
-      - Set of APIs that enable efficient communication between multiple GPUs,
-        allowing for distributed computing and data parallelism.
-      - 1.9.0
-      - 5.0
-    * - ``torch.cuda.CUDAGraph``
-      - Graphs capture sequences of GPU operations to minimize kernel launch
-        overhead and improve performance.
-      - 1.10.0
-      - 5.3
-    * - TunableOp
-      - A mechanism that allows certain operations to be more flexible and
-        optimized for performance. It enables automatic tuning of kernel
-        configurations and other settings to achieve the best possible
-        performance based on the specific hardware (GPU) and workload.
-      - 2.0
-      - 5.4
-    * - NVIDIA Tools Extension (NVTX)
-      - Integration with NVTX for profiling and debugging GPU performance using
-        NVIDIA's Nsight tools.
-      - 1.8.0
-      - ❌
-    * - Lazy loading NVRTC
-      - Delays JIT compilation with NVRTC until the code is explicitly needed.
-      - 1.13.0
-      - ❌
-    * - Jiterator (beta)
-      - Jiterator allows asynchronous data streaming into computation streams
-        during training loops.
-      - 1.13.0
-      - 5.2
-
-.. Need to validate and extend.
-
-torch.backends.cuda
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``torch.backends.cuda`` is a PyTorch module that provides configuration options
-and flags to control the behavior of ROCm or CUDA operations. It is part of the
-PyTorch backend configuration system, which allows users to fine-tune how
-PyTorch interacts with the ROCm or CUDA environment.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Feature
-      - Description
-      - As of PyTorch
-      - As of ROCm
-    * - ``cufft_plan_cache``
-      - Manages caching of GPU FFT plans to optimize repeated FFT computations.
-      - 1.7.0
-      - 5.0
-    * - ``matmul.allow_tf32``
-      - Enables or disables the use of TensorFloat-32 (TF32) precision for
-        faster matrix multiplications on GPUs with Tensor Cores.
-      - 1.10.0
-      - ❌
-    * - ``matmul.allow_fp16_reduced_precision_reduction``
-      - Reduced precision reductions (e.g., with fp16 accumulation type) are
-        allowed with fp16 GEMMs.
-      - 2.0
-      - ❌
-    * - ``matmul.allow_bf16_reduced_precision_reduction``
-      - Reduced precision reductions are allowed with bf16 GEMMs.
-      - 2.0
-      - ❌
-    * - ``enable_cudnn_sdp``
-      - Globally enables cuDNN SDPA's kernels within SDPA.
-      - 2.0
-      - ❌
-    * - ``enable_flash_sdp``
-      - Globally enables or disables FlashAttention for SDPA.
-      - 2.1
-      - ❌
-    * - ``enable_mem_efficient_sdp``
-      - Globally enables or disables Memory-Efficient Attention for SDPA.
-      - 2.1
-      - ❌
-    * - ``enable_math_sdp``
-      - Globally enables or disables the PyTorch C++ implementation within SDPA.
-      - 2.1
-      - ❌
-
-.. Need to validate and extend.
-
-torch.backends.cudnn
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Supported ``torch`` options include:
-
-.. list-table::
-    :header-rows: 1
-
-    * - Option
-      - Description
-      - As of PyTorch
-      - As of ROCm
-    * - ``allow_tf32``
-      - TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
-        Ampere or newer GPUs.
-      - 1.12.0
-      - ❌
-    * - ``deterministic``
-      - A bool that, if True, causes cuDNN to only use deterministic
-        convolution algorithms.
-      - 1.12.0
-      - 6.0
-
-Automatic mixed precision: torch.amp
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-PyTorch automates the process of using both 16-bit (half-precision, float16) and
-32-bit (single-precision, float32) floating-point types in model training and
-inference.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Feature
-      - Description
-      - As of PyTorch
-      - As of ROCm
-    * - Autocasting
-      - Autocast instances serve as context managers or decorators that allow
-        regions of your script to run in mixed precision.
-      - 1.9
-      - 2.5
-    * - Gradient scaling
-      - To prevent underflow, “gradient scaling” multiplies the network’s
-        loss by a scale factor and invokes a backward pass on the scaled
-        loss. The same factor then scales gradients flowing backward through
-        the network. In other words, gradient values have a larger magnitude so
-        that they don’t flush to zero.
-      - 1.9
-      - 2.5
-    * - CUDA op-specific behavior
-      - These ops always go through autocasting whether they are invoked as part
-        of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
-        functions are exposed in multiple namespaces, they go through
-        autocasting regardless of the namespace.
-      - 1.9
-      - 2.5
-
-Distributed library features
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-PyTorch distributed library includes a collective of parallelism modules, a
-communications layer, and infrastructure for launching and debugging large
-training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.
-
-The Distributed Library feature in PyTorch provides tools and APIs for building
-and running distributed machine learning workflows. It allows training models
-across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
-of computational resources and scalability for large-scale tasks.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Feature
-      - Description
-      - As of PyTorch
-      - As of ROCm
-    * - TensorPipe
-      - A point-to-point communication library integrated into
-        PyTorch for distributed training. It handles tensor data transfers
-        efficiently between different processes or devices, including those on
-        separate machines.
-      - 1.8
-      - 5.4
-    * - Gloo
-      - Designed for multi-machine and multi-GPU setups, enabling
-        efficient communication and synchronization between processes. Gloo is
-        one of the default backends for PyTorch's Distributed Data Parallel
-        (DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
-      - 1.0
-      - 2.0
-
-torch.compiler
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. list-table::
-    :header-rows: 1
-
-    * - Feature
-      - Description
-      - As of PyTorch
-      - As of ROCm
-    * - ``torch.compiler`` (AOT Autograd)
-      - Autograd captures not only the user-level code, but also backpropagation,
-        which results in capturing the backwards pass “ahead-of-time”. This
-        enables acceleration of both forwards and backwards pass using
-        ``TorchInductor``.
-      - 2.0
-      - 5.3
-    * - ``torch.compiler`` (TorchInductor)
-      - The default ``torch.compile`` deep learning compiler that generates fast
-        code for multiple accelerators and backends. You need to use a backend
-        compiler to make speedups through ``torch.compile`` possible. For AMD,
-        NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
-      - 2.0
-      - 5.3
-
-torchaudio
+Supported modules
 --------------------------------------------------------------------------------

-The `torchaudio <https://pytorch.org/audio/stable/index.html>`_ library provides
-utilities for processing audio data in PyTorch, such as audio loading,
-transformations, and feature extraction.
+For a complete and up-to-date list of PyTorch core modules (for example, ``torch``,
+``torch.nn``, ``torch.cuda``, ``torch.backends.cuda`` and
+``torch.backends.cudnn``), their descriptions, and usage, please refer directly
+to the `official PyTorch documentation <https://pytorch.org/docs/stable/index.html>`_.

-To ensure GPU-acceleration with ``torchaudio.transforms``, you need to
-explicitly move audio data (waveform tensor) to GPU using ``.to('cuda')``.
+Core PyTorch functionality on ROCm includes tensor operations, neural network
+layers, automatic differentiation, distributed training, mixed-precision
+training, compilation features, and domain-specific libraries for audio, vision,
+text processing, and more.

-The following ``torchaudio`` features are GPU-accelerated.
+Supported domain libraries
+--------------------------------------------------------------------------------
+
+PyTorch offers specialized `domain libraries <https://pytorch.org/domains/>`_ with
+GPU acceleration that build on its core features to support specific application
+areas. The table below lists the PyTorch domain libraries that are compatible
+with ROCm.

 .. list-table::
    :header-rows: 1

-    * - Feature
+    * - Library
      - Description
-      - As of torchaudio version
-      - As of ROCm
-    * - ``torchaudio.transforms.Spectrogram``
-      - Generate a spectrogram of an input waveform using STFT.
-      - 0.6.0
-      - 4.5
-    * - ``torchaudio.transforms.MelSpectrogram``
-      - Generates the mel-scale spectrogram of raw audio signals.
-      - 0.9.0
-      - 4.5
-    * - ``torchaudio.transforms.MFCC``
-      - Extract of MFCC features.
-      - 0.9.0
-      - 4.5
-    * - ``torchaudio.transforms.Resample``
-      - Resamples a signal from one frequency to another.
-      - 0.9.0
-      - 4.5

-torchvision
--------------------------------------------------------------------------------
+    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_ 
+      - Audio and signal processing library for PyTorch. Provides utilities for
+        audio I/O, signal and data processing functions, datasets, model
+        implementations, and application components for audio and speech
+        processing tasks.

-The `torchvision <https://pytorch.org/vision/stable/index.html>`_ library
-provides datasets, model architectures, and common image transformations for
-computer vision.
+        **Note:** To ensure GPU-acceleration with ``torchaudio.transforms``,
+        you need to explicitly move audio data (waveform tensor) to GPU using
+        ``.to('cuda')``.

-The following ``torchvision`` features are GPU-accelerated.
+    * - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
+      - PyTorch-native library designed for fine-tuning large language models
+        (LLMs). Supports the full fine-tuning workflow and offers
+        compatibility with popular production inference systems.

-.. list-table::
-    :header-rows: 1
+        **Note:** Only the official release exists.

-    * - Feature
-      - Description
-      - As of torchvision version
-      - As of ROCm
-    * - ``torchvision.transforms.functional``
-      - Provides GPU-compatible transformations for image preprocessing like
-        resize, normalize, rotate and crop.
-      - 0.2.0
-      - 4.0
-    * - ``torchvision.ops``
-      - GPU-accelerated operations for object detection and segmentation tasks.
-        ``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
-        ``box_convert``.
-      - 0.6.0
-      - 3.3
-    * - ``torchvision.models`` with ``.to('cuda')``
-      - ``torchvision`` provides several pre-trained models (ResNet, Faster
-        R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
-        training.
-      - 0.1.6
-      - 2.x
-    * - ``torchvision.io``
-      - Enables video decoding and frame extraction using GPU acceleration with NVIDIA’s
-        NVDEC and nvJPEG (rocJPEG) on CUDA-enabled GPUs.
-      - 0.4.0
-      - 6.3
+    * - `torchvision <https://docs.pytorch.org/vision/stable/index.html>`_
+      - Computer vision library that is part of the PyTorch project. Provides
+        popular datasets, model architectures, and common image transformations
+        for computer vision applications.

-torchtext
--------------------------------------------------------------------------------
+    * - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
+      - Text processing library for PyTorch. Provides data processing utilities
+        and popular datasets for natural language processing, including
+        tokenization, vocabulary management, and text embeddings.

-The `torchtext <https://pytorch.org/text/stable/index.html>`_ library provides
-utilities for processing and working with text data in PyTorch, including
-tokenization, vocabulary management, and text embeddings. torchtext supports
-preprocessing pipelines and integration with PyTorch models, simplifying the
-implementation of natural language processing (NLP) tasks.
+        **Note:** ``torchtext`` does not implement ROCm-specific kernels. 
+        ROCm acceleration is provided through the underlying PyTorch framework
+        and ROCm library integration. Only the official release exists.

-To leverage GPU acceleration in torchtext, you need to move tensors
-explicitly to the GPU using ``.to('cuda')``.
+    * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
+      - Beta library of common modular data loading primitives for easily
+        constructing flexible and performant data pipelines, with features still
+        in prototype stage.

-* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
+    * - `torchrec <https://docs.pytorch.org/torchrec/>`_
+      - PyTorch domain library for common sparsity and parallelism primitives
+        needed for large-scale recommender systems, enabling authors to train
+        models with large embedding tables shared across many GPUs.

-* Only official release exists.
+        **Note:** ``torchrec`` does not implement ROCm-specific kernels. ROCm
+        acceleration is provided through the underlying PyTorch framework and
+        ROCm library integration.

-torchtune
--------------------------------------------------------------------------------
+    * - `torchserve <https://docs.pytorch.org/serve/>`_
+      - Performant, flexible and easy-to-use tool for serving PyTorch models in
+        production, providing features for model management, batch processing,
+        and scalable deployment.

-The `torchtune <https://pytorch.org/torchtune/stable/index.html>`_ library for
-authoring, fine-tuning and experimenting with LLMs.
+        **Note:** `torchserve <https://docs.pytorch.org/serve/>`_ is no longer
+        actively maintained. The last official release shipped with PyTorch 2.4.

-* Usage: Enabling developers to fine-tune ROCm PyTorch solutions.
+    * - `torchrl <https://docs.pytorch.org/rl/stable/index.html>`_
+      - Open-source, Python-first Reinforcement Learning library for PyTorch
+        with a focus on high modularity and good runtime performance, providing
+        low and high-level RL abstractions and reusable functionals for cost
+        functions, returns, and data processing.

-* Only official release exists.
+        **Note:** Only the official release exists.

-torchserve
--------------------------------------------------------------------------------
+    * - `tensordict <https://docs.pytorch.org/tensordict/stable/index.html>`_
+      - Dictionary-like class that simplifies operations on batches of tensors,
+        enhancing code readability, compactness, and modularity by abstracting
+        tailored operations and reducing errors through automatic operation
+        dispatching.

-The `torchserve <https://pytorch.org/serve/>`_ is a PyTorch domain library
-for common sparsity and parallelism primitives needed for large-scale recommender
-systems.
-
-* torchtext does not implement its own kernels. ROCm support is enabled by
-  linking against ROCm libraries.
-
-* Only official release exists.
-
-torchrec
--------------------------------------------------------------------------------
-
-The `torchrec <https://pytorch.org/torchrec/>`_ is a PyTorch domain library for
-common sparsity and parallelism primitives needed for large-scale recommender
-systems.
-
-* torchrec does not implement its own kernels. ROCm support is enabled by
-  linking against ROCm libraries.
-
-* Only official release exists.
-
-Unsupported PyTorch features
-================================================================================
-
-The following GPU-accelerated PyTorch features are not supported by ROCm for
-the listed supported PyTorch versions.
-
-.. list-table::
-    :widths: 30, 60, 10
-    :header-rows: 1
-
-    * - Feature
-      - Description
-      - As of PyTorch
-    * - APEX batch norm
-      - Use APEX batch norm instead of PyTorch batch norm.
-      - 1.6.0
-    * - ``torch.backends.cuda`` / ``matmul.allow_tf32``
-      - A bool that controls whether TensorFloat-32 tensor cores may be used in
-        matrix multiplications.
-      - 1.7
-    * - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
-      - Integration with NVTX for profiling and debugging GPU performance using
-        NVIDIA's Nsight tools.
-      - 1.7.0
-    * - ``torch.cuda`` / Lazy loading NVRTC
-      - Delays JIT compilation with NVRTC until the code is explicitly needed.
-      - 1.8.0
-    * - ``torch-tensorrt``
-      - Integrate TensorRT library for optimizing and deploying PyTorch models.
-        ROCm does not have equialent library for TensorRT.
-      - 1.9.0
-    * - ``torch.backends`` / ``cudnn.allow_tf32``
-      - TensorFloat-32 tensor cores may be used in cuDNN convolutions.
-      - 1.10.0
-    * - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
-      - Reduced precision reductions with fp16 accumulation type are
-        allowed with fp16 GEMMs.
-      - 2.0
-    * - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
-      - Reduced precision reductions are allowed with bf16 GEMMs.
-      - 2.0
-    * - ``torch.nn.functional`` / ``scaled_dot_product_attention``
-      - Flash attention backend for SDPA to accelerate attention computation in
-        transformer-based models.
-      - 2.0
-    * - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
-      - Globally enables cuDNN SDPA's kernels within SDPA.
-      - 2.0
-    * - ``torch.backends.cuda`` / ``enable_flash_sdp``
-      - Globally enables or disables FlashAttention for SDPA.
-      - 2.1
-    * - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
-      - Globally enables or disables Memory-Efficient Attention for SDPA.
-      - 2.1
-    * - ``torch.backends.cuda`` / ``enable_math_sdp``
-      - Globally enables or disables the PyTorch C++ implementation within SDPA.
-      - 2.1
-    * - Dynamic parallelism
-      - PyTorch itself does not directly expose dynamic parallelism as a core
-        feature. Dynamic parallelism allow GPU threads to launch additional
-        threads which can be reached using custom operations via the
-        ``torch.utils.cpp_extension`` module.
-      - Not a core feature
-    * - Unified memory support in PyTorch
-      - Unified Memory is not directly exposed in PyTorch's core API, it can be
-        utilized effectively through custom CUDA extensions or advanced
-        workflows.
-      - Not a core feature
+        **Note:** Only the official release exists.
--- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
@@ -0,0 +1,100 @@
+:orphan:
+
+.. meta::
+    :description: Stanford Megatron-LM compatibility
+    :keywords: Stanford, Megatron-LM, compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+Stanford Megatron-LM compatibility
+********************************************************************************
+
+Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
+designed to train massive transformer-based language models efficiently using model and data parallelism.
+
+* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository. 
+* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository. 
+* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled. 
+* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
+
+.. note::
+
+	Stanford Megatron-LM is supported on ROCm 6.3.0.
+
+
+Supported Devices
+================================================================================
+
+- **Officially Supported**: AMD Instinct MI300X
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+
+
+Supported models and features
+================================================================================
+
+This section details the models and features that are supported by the ROCm version of Stanford Megatron-LM.
+
+Models:
+
+* Bert
+* GPT
+* T5
+* ICT
+
+Features:
+
+* Distributed Pre-training
+* Activation Checkpointing and Recomputation
+* Distributed Optimizer
+* Mixture-of-Experts
+
+.. _megatron-lm-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post  
+to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs. 
+Coverage includes:
+
+  * Single-GPU pre-training
+  * Multi-GPU pre-training
+
+
+.. _megatron-lm-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
+with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
+inventories represent the latest Megatron-LM version from the official Docker Hub.
+The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: 
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - Stanford Megatron-LM
+      - PyTorch
+      - Ubuntu
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
+      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+
+      
+
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -56,7 +56,7 @@ Docker image compatibility
 AMD validates and publishes ready-made `TensorFlow images
 <https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
 Docker Hub. The following Docker image tags and associated inventories are
-validated for `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click
+validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click
 the |docker-icon| icon to view the image on Docker Hub.

 .. list-table:: TensorFlow Docker image components
@@ -73,82 +73,122 @@ the |docker-icon| icon to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-dev/images/sha256-fa9cf5fa6c6079a7118727531ccd0056c6e3224a42c3d6e78a49e7781daafff4"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-runtime/images/sha256-14addca4b92a47c806b83ebaeed593fc6672cd99f0017ed8dad759fe72ed0309"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.18-runtime/images/sha256-d14d8c4989e7c9a60f4e72461b9e349de72347c6162dcd6897e6f4f80ffbb440"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.18-dev/images/sha256-f5e151060df04ff5fb59f5604b49cd371931bbe75b06aec9fe7781397c4be0ce"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-dev/images/sha256-081e5bd6615a5dc17247ebd2ccc26895c3feeff086720400fa39b477e60a77c0"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.18-runtime/images/sha256-5cd4c03fdb1036570c0d4929da60a65c4466998dc80f1dc8a5a0b173eae017fb"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-runtime/images/sha256-bf369637378264f4af6ddad5ca8b8611d3e372ffbea9ab7a06f1e122f0a0867b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.17-dev/images/sha256-b3add80e374a2db2d1088d746e740afa89d439aca02cacba959ad298f5cd2b3f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-dev/images/sha256-5a502008c50d0b6508e6027f911bdff070a7493700ae064bed74e1d22b91ed50"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.17-runtime/images/sha256-3a244f026c32177eff7958ffbad390de85b438b2b48b455cc39f15d70fa1270d"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-runtime/images/sha256-1ee5dfffceb71ac66617ada33de3a10de0cb74199cc4b82441192e5e92fa2ddf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.17-dev/images/sha256-e0cecdfacb59169335049983cdab6da578c209bb9f4d08aad97e184ae59171a6"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-dev/images/sha256-109218ad92bfae83bbd2710475f7502166e1ed54ca0b9748a9cbc3f5a1d75af1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.17-runtime/images/sha256-6f43de12f7eb202791b698ac51d28b72098de90034dbcd48486629b0125f7707"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-runtime/images/sha256-5d78bd5918d394f92263daa2990e88d695d27200dd90ed83ec64d20c7661c9c1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-dev/images/sha256-b09b1ad921c09c687b7c916141051e9fcf15539a5686e5aa67c689195a522719"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - dev
+      - 24.04
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-runtime/images/sha256-20dbd824e85558abfe33fc9283cc547d88cde3c623fe95322743a5082f883a64"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - runtime
+      - 24.04
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - dev
+      - 22.04
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-runtime/images/sha256-a94150ffb81365234ebfa34e764db5474bc6ab7d141b56495eac349778dafcf3"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - runtime
+      - 22.04
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
+

 Critical ROCm libraries for TensorFlow
 ===============================================================================
--- a/docs/compatibility/ml-compatibility/verl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/verl-compatibility.rst
@@ -0,0 +1,85 @@
+:orphan:
+
+.. meta::
+   :description: verl compatibility
+   :keywords: GPU, verl compatibility
+
+.. version-set:: rocm_version latest
+
+*******************************************************************************
+verl compatibility
+*******************************************************************************
+
+Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs). 
+verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
+
+* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl. 
+* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
+* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled. 
+* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to get started.
+
+.. note::
+
+	verl is supported on ROCm 6.2.0.
+
+
+.. _verl-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
+
+.. _verl-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl>`_
+with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the latest verl version from the official Docker Hub. The Docker images have been validated for `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_. 
+
+.. list-table:: 
+    :header-rows: 1
+
+    *   - Docker image
+        - verl
+        - Linux
+        - PyTorch
+        - Python
+        - vllm
+
+    *   - .. raw:: html
+
+            <a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
+        - `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
+        - Ubuntu 20.04
+        - `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
+        - `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
+        - `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`_
+
+
+Supported features
+===============================================================================
+
+The following table shows verl and ROCm support for GPU-accelerated modules.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Module
+      - Description
+      - verl version
+      - ROCm version
+    * - ``FSDP``
+      - Training engine
+      - 0.3.0.post0
+      - 6.2
+    * - ``vllm``
+      - Inference engine
+      - 0.3.0.post0
+      - 6.2
+  
--- a/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
@@ -8,7 +8,7 @@ MI300 and MI200 series performance counters and metrics

 This document lists and describes the hardware performance counters and derived metrics available
 for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
-:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
+:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.

 MI300 and MI200 series performance counters
 ===============================================================
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -34,15 +34,15 @@ project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "6.4.0"
-release = "6.4.0"
+version = "6.4.1"
+release = "6.4.1"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""

 # pages with specific settings
 article_pages = [
-    {"file": "about/release-notes", "os": ["linux"], "date": "2025-04-11"},
+    {"file": "about/release-notes", "os": ["linux"], "date": "2025-05-07"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
@@ -51,14 +51,28 @@ article_pages = [
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
@@ -67,11 +81,20 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/install", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
@@ -127,6 +150,7 @@ html_theme_options = {"link_main_doc": False}
 redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}

 numfig = False
+suppress_warnings = ["autosectionlabel.*"]

 html_context = {
    "project_path" : {project_path},
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
@@ -0,0 +1,159 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
+      rocm_version: 6.3.1
+      vllm_version: 0.7.3
+      pytorch_version: 2.7.0 (dev nightly)
+      hipblaslt_version: 0.13
+  model_groups:
+    - group: Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 3.2 11B Vision
+        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
+        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+    - group: JAIS
+      tag: jais
+      models:
+      - model: JAIS 13B
+        mad_tag: pyt_vllm_jais-13b
+        model_repo: core42/jais-13b-chat
+        url: https://huggingface.co/core42/jais-13b-chat
+        precision: float16
+      - model: JAIS 30B
+        mad_tag: pyt_vllm_jais-30b
+        model_repo: core42/jais-30b-chat-v3
+        url: https://huggingface.co/core42/jais-30b-chat-v3
+        precision: float16
+    - group: DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
@@ -0,0 +1,152 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
+      rocm_version: 6.3.1
+      vllm_version: 0.8.3
+      pytorch_version: 2.7.0 (dev nightly)
+      hipblaslt_version: 0.13
+  model_groups:
+    - group: Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 3.2 11B Vision
+        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
+        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        tunableop: true
+    - group: DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
@@ -0,0 +1,152 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4
+      rocm_version: 6.3.1
+      vllm_version: 0.8.5
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 3.2 11B Vision
+        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
+        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
@@ -0,0 +1,167 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
+      rocm_version: 6.3.1
+      vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+        - model: Llama 3.1 8B
+          mad_tag: pyt_vllm_llama-3.1-8b
+          model_repo: meta-llama/Llama-3.1-8B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+          precision: float16
+        - model: Llama 3.1 70B
+          mad_tag: pyt_vllm_llama-3.1-70b
+          model_repo: meta-llama/Llama-3.1-70B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+          precision: float16
+        - model: Llama 3.1 405B
+          mad_tag: pyt_vllm_llama-3.1-405b
+          model_repo: meta-llama/Llama-3.1-405B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+          precision: float16
+        - model: Llama 3.2 11B Vision
+          mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
+          model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
+          precision: float16
+        - model: Llama 2 7B
+          mad_tag: pyt_vllm_llama-2-7b
+          model_repo: meta-llama/Llama-2-7b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+          precision: float16
+        - model: Llama 2 70B
+          mad_tag: pyt_vllm_llama-2-70b
+          model_repo: meta-llama/Llama-2-70b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+          precision: float16
+        - model: Llama 3.1 8B FP8
+          mad_tag: pyt_vllm_llama-3.1-8b_fp8
+          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 70B FP8
+          mad_tag: pyt_vllm_llama-3.1-70b_fp8
+          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 405B FP8
+          mad_tag: pyt_vllm_llama-3.1-405b_fp8
+          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+          precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+        - model: Mixtral MoE 8x7B
+          mad_tag: pyt_vllm_mixtral-8x7b
+          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+          precision: float16
+        - model: Mixtral MoE 8x22B
+          mad_tag: pyt_vllm_mixtral-8x22b
+          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+          precision: float16
+        - model: Mistral 7B
+          mad_tag: pyt_vllm_mistral-7b
+          model_repo: mistralai/Mistral-7B-Instruct-v0.3
+          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+          precision: float16
+        - model: Mixtral MoE 8x7B FP8
+          mad_tag: pyt_vllm_mixtral-8x7b_fp8
+          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mixtral MoE 8x22B FP8
+          mad_tag: pyt_vllm_mixtral-8x22b_fp8
+          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mistral 7B FP8
+          mad_tag: pyt_vllm_mistral-7b_fp8
+          model_repo: amd/Mistral-7B-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+          precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+        - model: Qwen2 7B
+          mad_tag: pyt_vllm_qwen2-7b
+          model_repo: Qwen/Qwen2-7B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+          precision: float16
+        - model: Qwen2 72B
+          mad_tag: pyt_vllm_qwen2-72b
+          model_repo: Qwen/Qwen2-72B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+          precision: float16
+        - model: QwQ-32B
+          mad_tag: pyt_vllm_qwq-32b
+          model_repo: Qwen/QwQ-32B
+          url: https://huggingface.co/Qwen/QwQ-32B
+          precision: float16
+          tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+        - model: DBRX Instruct
+          mad_tag: pyt_vllm_dbrx-instruct
+          model_repo: databricks/dbrx-instruct
+          url: https://huggingface.co/databricks/dbrx-instruct
+          precision: float16
+        - model: DBRX Instruct FP8
+          mad_tag: pyt_vllm_dbrx_fp8
+          model_repo: amd/dbrx-instruct-FP8-KV
+          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+          precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+        - model: Gemma 2 27B
+          mad_tag: pyt_vllm_gemma-2-27b
+          model_repo: google/gemma-2-27b
+          url: https://huggingface.co/google/gemma-2-27b
+          precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+        - model: C4AI Command R+ 08-2024
+          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+          precision: float16
+        - model: C4AI Command R+ 08-2024 FP8
+          mad_tag: pyt_vllm_command-r-plus_fp8
+          model_repo: amd/c4ai-command-r-plus-FP8-KV
+          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+          precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+        - model: DeepSeek MoE 16B
+          mad_tag: pyt_vllm_deepseek-moe-16b-chat
+          model_repo: deepseek-ai/deepseek-moe-16b-chat
+          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+          precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+        - model: Phi-4
+          mad_tag: pyt_vllm_phi-4
+          model_repo: microsoft/phi-4
+          url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+        - model: Falcon 180B
+          mad_tag: pyt_vllm_falcon-180b
+          model_repo: tiiuae/falcon-180B
+          url: https://huggingface.co/tiiuae/falcon-180B
+          precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
@@ -0,0 +1,162 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
+      rocm_version: 6.4.1
+      vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+      - model: Falcon 180B
+        mad_tag: pyt_vllm_falcon-180b
+        model_repo: tiiuae/falcon-180B
+        url: https://huggingface.co/tiiuae/falcon-180B
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
@@ -0,0 +1,163 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # TODO(review): "update me" note copied from the live file; confirm this archived snapshot is final
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+      - model: Falcon 180B
+        mad_tag: pyt_vllm_falcon-180b
+        model_repo: tiiuae/falcon-180B
+        url: https://huggingface.co/tiiuae/falcon-180B
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
@@ -23,3 +23,19 @@ pytorch_inference_benchmark:
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/chaidiscovery/chai-1
        precision: float16
+    - group: Mochi Video
+      tag: mochi
+      models:
+      - model: Mochi 1
+        mad_tag: pyt_mochi_video_inference
+        model_repo: genmo/mochi-1-preview
+        url: https://huggingface.co/genmo/mochi-1-preview
+        precision: float16
+    - group: Wan2.1
+      tag: wan
+      models:
+      - model: Wan2.1
+        mad_tag: pyt_wan2.1_inference
+        model_repo: Wan-AI/Wan2.1-T2V-14B
+        url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
+        precision: bfloat16
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,14 +1,15 @@
 vllm_benchmark:
  unified_docker:
    latest:
-      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
-      rocm_version: 6.3.1
-      vllm_version: 0.8.3
-      pytorch_version: 2.7.0 (dev nightly)
-      hipblaslt_version: 0.13
+      # TODO: update pull_tag, docker_hub_url, and the version fields below for each new image release
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
  model_groups:
-    - group: Llama
+    - group: Meta Llama
      tag: llama
      models:
      - model: Llama 3.1 8B
@@ -26,11 +27,6 @@ vllm_benchmark:
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
-      - model: Llama 3.2 11B Vision
-        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
-        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
-        precision: float16
      - model: Llama 2 7B
        mad_tag: pyt_vllm_llama-2-7b
        model_repo: meta-llama/Llama-2-7b-chat-hf
@@ -56,7 +52,7 @@ vllm_benchmark:
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
-    - group: Mistral
+    - group: Mistral AI
      tag: mistral
      models:
      - model: Mixtral MoE 8x7B
@@ -108,7 +104,7 @@ vllm_benchmark:
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
-    - group: DBRX
+    - group: Databricks DBRX
      tag: dbrx
      models:
      - model: DBRX Instruct
@@ -121,7 +117,7 @@ vllm_benchmark:
        model_repo: amd/dbrx-instruct-FP8-KV
        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
        precision: float8
-    - group: Gemma
+    - group: Google Gemma
      tag: gemma
      models:
      - model: Gemma 2 27B
@@ -150,3 +146,18 @@ vllm_benchmark:
        model_repo: deepseek-ai/deepseek-moe-16b-chat
        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+      - model: Falcon 180B
+        mad_tag: pyt_vllm_falcon-180b
+        model_repo: tiiuae/falcon-180B
+        url: https://huggingface.co/tiiuae/falcon-180B
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -0,0 +1,60 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.6_py312
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.12"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 24.04 + Python 3.12
+  - pull_tag: rocm/megatron-lm:v25.6_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 22.04 + Python 3.10
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml
@@ -0,0 +1,29 @@
+megatron-lm_benchmark:
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek-V3
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora, HF_finetune_lora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/rocm-software-stack-6_4_0.jpg
+++ b/docs/data/rocm-software-stack-6_4_0.jpg
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -17,6 +17,9 @@ features for these ROCm-enabled deep learning frameworks.
 * :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
 * :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
 * :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
+* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
+* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
+* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`

 This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.

@@ -29,6 +32,9 @@ See the installation instructions to get started.
 * :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
 * :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
 * :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
+* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
+* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
+* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`

 .. note::

--- a/docs/how-to/gpu-performance/mi300x.rst
+++ b/docs/how-to/gpu-performance/mi300x.rst
@@ -7,21 +7,21 @@ AMD Instinct MI300X performance guides
 **************************************

 The following performance guides provide essential guidance on the necessary
-steps to properly :doc:`configure your system for AMD Instinct™ MI300X
-accelerators <../system-optimization/mi300x>`. They include detailed
-instructions on system settings and application :doc:`workload tuning
-<../rocm-for-ai/inference-optimization/workload>` to help you
-leverage the maximum capabilities of these accelerators and achieve superior
-performance.
+steps to properly `configure your system for AMD Instinct™ MI300X accelerators
+<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+They include detailed instructions on system settings and application
+:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
+help you leverage the maximum capabilities of these accelerators and achieve
+superior performance.

 * `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
  covers essential system settings and system management practices to configure
  your AMD Instinct MI300X system for performance.

-* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
+* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
  optimize the performance of AMD Instinct MI300X series accelerators for HPC
  and deep learning operations.

-* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
+* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
  environment for LLM inference, designed to help you test performance with
  popular models on AMD Instinct MI300X series accelerators.
--- a/docs/how-to/rocm-for-ai/fine-tuning/index.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/index.rst
@@ -24,5 +24,3 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
 - :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
  :doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
  :doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.
-
-
--- a/docs/how-to/rocm-for-ai/index.rst
+++ b/docs/how-to/rocm-for-ai/index.rst
@@ -6,7 +6,7 @@
 Use ROCm for AI
 **************************

-ROCm™ is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
+ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.

 You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with composable kernel.
 
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -151,8 +151,8 @@ desired effect. Continuous iteration helps refine the performance gains and
 address any new bottlenecks that may emerge.

 ROCm provides a prebuilt optimized Docker image that has everything required to implement
-the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV 
-format. For more information, see :doc:`../inference/vllm-benchmark`.
+the LLM inference tips in this section. It includes ROCm, PyTorch, and vLLM.
+For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

 .. _mi300x-profiling-tools:

@@ -343,9 +343,10 @@ The following performance tips are not *specific* to vLLM -- they are general
 but relevant in this context. You can tune the following vLLM parameters to
 achieve optimal request latency and throughput performance.

-* As described in :ref:`mi300x-env-vars`, the environment
-  variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM performance. Set it to
-  ``export HIP_FORCE_DEV_KERNARG=1``.
+* As described in `Environment variables (MI300X)
+  <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
+  the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
+  performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.

 * Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
  to ``112`` to increase the number of channels on MI300X to potentially improve
@@ -410,9 +411,9 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
 usage with ROCm.

 ROCm provides a prebuilt optimized Docker image for validating the performance
-of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
-ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
-see :doc:`../inference/vllm-benchmark`.
+of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
+ROCm, vLLM, and PyTorch. For more information, see
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

 .. _mi300x-vllm-throughput-measurement:

@@ -678,7 +679,7 @@ To specify the quantization scaling config, use the
 ``--quantization-param-path`` parameter. If the parameter is not specified,
 the default scaling factor of ``1`` is used, which can lead to less accurate
 results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
-Cache <https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md>`__
+Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
 in the vLLM GitHub repository.

 Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
@@ -1477,8 +1478,9 @@ following command: ``cat /proc/sys/kernel/numa_balancing`` and
 checking whether the output is ``0``.

 If the output is ``1``, you can disable NUMA auto-balancing by running the
-following command: ``sudo sysctl kernel.numa_balancing=0``. For more
-details, see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+following command: ``sudo sysctl kernel.numa_balancing=0``. For more details,
+see `AMD Instinct MI300X system optimization
+<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_.

 .. _mi300x-rccl-disable-acs:

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
@@ -0,0 +1,346 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
+                 ROCm Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
+a prebuilt, optimized environment designed for validating large language model
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
+ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
+MI300X accelerator and includes the following components:
+
+* `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_
+
+* `vLLM 0.4.3 <https://docs.vllm.ai/en/latest>`_
+
+* `PyTorch 2.4.0 <https://github.com/pytorch/pytorch>`_
+
+* Tuning files (in CSV format)
+
+With this Docker image, you can quickly validate the expected inference
+performance numbers on the MI300X accelerator. This topic also provides tips on
+optimizing performance with popular AI models.
+
+.. _vllm-benchmark-vllm:
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and
+   serving. It deploys the PagedAttention algorithm, which reduces memory
+   consumption and increases throughput by leveraging dynamic key and value
+   allocation in GPU memory. vLLM also incorporates many LLM acceleration
+   and quantization algorithms. In addition, AMD implements high-performance
+   custom kernels and modules in vLLM to enhance performance further. See
+   :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more
+   information.
+
+Getting started
+===============
+
+Use the following procedures to reproduce the benchmark results on an
+MI300X accelerator with the prebuilt vLLM Docker image.
+
+.. _vllm-benchmark-get-started:
+
+1. Disable NUMA auto-balancing.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
+
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
+
+Once setup is complete, you can choose between two options to reproduce the
+benchmark results:
+
+-  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+
+-  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+
+.. _vllm-benchmark-mad:
+
+MAD-integrated benchmarking
+===========================
+
+Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+directory and install the required packages on the host machine.
+
+.. code-block:: shell
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD
+   pip install -r requirements.txt
+
+Use this command to run a performance benchmark test of the Llama 3.1 8B model
+on one GPU with the ``float16`` data type on the host machine.
+
+.. code-block:: shell
+
+   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
+
+ROCm MAD launches a Docker container with the name
+``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
+model are collected in the following path: ``~/MAD/reports_float16/``.
+
+Although the following eight models are pre-configured to collect latency and
+throughput performance data, users can also change the benchmarking parameters.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+
+Available models
+----------------
+
+.. hlist::
+   :columns: 3
+
+   * ``pyt_vllm_llama-3.1-8b``
+
+   * ``pyt_vllm_llama-3.1-70b``
+
+   * ``pyt_vllm_llama-3.1-405b``
+
+   * ``pyt_vllm_llama-2-7b``
+
+   * ``pyt_vllm_mistral-7b``
+
+   * ``pyt_vllm_qwen2-7b``
+
+   * ``pyt_vllm_jais-13b``
+
+   * ``pyt_vllm_jais-30b``
+
+.. _vllm-benchmark-standalone:
+
+Standalone benchmarking
+=======================
+
+You can run the vLLM benchmark tool independently by starting the
+:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
+snippet.
+
+.. code-block::
+
+   docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
+   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name unified_docker_vllm rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
+
+In the Docker container, clone the ROCm MAD repository and navigate to the
+benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+.. code-block::
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD/scripts/vllm
+
+Multiprocessing distributed executor
+--------------------------------------
+
+To optimize vLLM performance, add the multiprocessing API server argument ``--distributed-executor-backend mp``.
+
+Command
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To start the benchmark, use the following command with the appropriate options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
+
+See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+
+.. note::
+
+   The input sequence length, output sequence length, and tensor parallel (TP) are
+   already configured. You don't need to specify them with this script.
+
+.. note::
+
+   If you encounter the following error, pass your access-authorized Hugging
+   Face token to the gated models.
+
+   .. code-block:: shell
+
+      OSError: You are trying to access a gated repo.
+
+      # pass your HF_TOKEN
+      export HF_TOKEN=$your_personal_hf_token
+
+.. _vllm-benchmark-standalone-options:
+
+Options
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Options
+     - Description
+
+   * - ``$test_option``
+     - latency
+     - Measure decoding token latency
+
+   * -
+     - throughput
+     - Measure token generation throughput
+
+   * -
+     - all
+     - Measure both throughput and latency
+
+   * - ``$model_repo``
+     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
+     - Llama 3.1 8B
+
+   * - (``float16``)
+     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
+     - Llama 3.1 70B
+
+   * -
+     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
+     - Llama 3.1 405B
+
+   * -
+     - ``meta-llama/Llama-2-7b-chat-hf``
+     - Llama 2 7B
+
+   * -
+     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
+     - Mixtral 8x7B
+
+   * -
+     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
+     - Mixtral 8x22B
+
+   * -
+     - ``mistralai/Mistral-7B-Instruct-v0.3``
+     - Mistral 7B
+
+   * -
+     - ``Qwen/Qwen2-7B-Instruct``
+     - Qwen2 7B
+
+   * -
+     - ``core42/jais-13b-chat``
+     - JAIS 13B
+
+   * -
+     - ``core42/jais-30b-chat-v3``
+     - JAIS 30B
+
+   * - ``$num_gpu``
+     - 1 or 8
+     - Number of GPUs
+
+   * - ``$datatype``
+     - ``float16``
+     - Data type
+
+.. _vllm-benchmark-run-benchmark:
+
+Running the benchmark on the MI300X accelerator
+-----------------------------------------------
+
+Here are some examples of running the benchmark with various options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+Latency benchmark example
+^^^^^^^^^^^^^^^^^^^^^^^^^
+ 
+Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
+
+.. code-block::
+
+   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
+
+Find the latency report at:
+
+- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
+
+Throughput benchmark example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
+
+Find the throughput report at:
+
+- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
+
+.. raw:: html
+
+   <style>
+   mjx-container[jax="CHTML"][display="true"] {
+       text-align: left;
+       margin: 0;
+   }
+
+   </style>
+
+.. note::
+
+   Throughput is calculated as:
+
+   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+
+Further reading
+===============
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
@@ -0,0 +1,416 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
+                 ROCm Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
+a prebuilt, optimized environment designed for validating large language model
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
+ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
+MI300X accelerator and includes the following components:
+
+* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
+
+* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
+
+* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
+
+* Tuning files (in CSV format)
+
+With this Docker image, you can quickly validate the expected inference
+performance numbers on the MI300X accelerator. This topic also provides tips on
+optimizing performance with popular AI models.
+
+.. hlist::
+   :columns: 6
+
+   * Llama 3.1 8B
+
+   * Llama 3.1 70B
+
+   * Llama 3.1 405B
+
+   * Llama 2 7B
+
+   * Llama 2 70B
+
+   * Mixtral 8x7B
+
+   * Mixtral 8x22B
+
+   * Mistral 7B
+
+   * Qwen2 7B
+
+   * Qwen2 72B
+
+   * JAIS 13B
+
+   * JAIS 30B
+
+.. _vllm-benchmark-vllm:
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+Getting started
+===============
+
+Use the following procedures to reproduce the benchmark results on an
+MI300X accelerator with the prebuilt vLLM Docker image.
+
+.. _vllm-benchmark-get-started:
+
+1. Disable NUMA auto-balancing.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
+
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+
+Once setup is complete, you can choose between two options to reproduce the
+benchmark results:
+
+-  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+
+-  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+
+.. _vllm-benchmark-mad:
+
+MAD-integrated benchmarking
+===========================
+
+Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+directory and install the required packages on the host machine.
+
+.. code-block:: shell
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD
+   pip install -r requirements.txt
+
+Use this command to run a performance benchmark test of the Llama 3.1 8B model
+on one GPU with the ``float16`` data type on the host machine.
+
+.. code-block:: shell
+
+   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
+
+ROCm MAD launches a Docker container with the name
+``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
+model are collected in the following path: ``~/MAD/reports_float16/``.
+
+Although the following models are preconfigured to collect latency and
+throughput performance data, you can also change the benchmarking parameters.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+
+Available models
+----------------
+
+.. hlist::
+   :columns: 3
+
+   * ``pyt_vllm_llama-3.1-8b``
+
+   * ``pyt_vllm_llama-3.1-70b``
+
+   * ``pyt_vllm_llama-3.1-405b``
+
+   * ``pyt_vllm_llama-2-7b``
+
+   * ``pyt_vllm_llama-2-70b``
+
+   * ``pyt_vllm_mixtral-8x7b``
+
+   * ``pyt_vllm_mixtral-8x22b``
+
+   * ``pyt_vllm_mistral-7b``
+
+   * ``pyt_vllm_qwen2-7b``
+
+   * ``pyt_vllm_qwen2-72b``
+
+   * ``pyt_vllm_jais-13b``
+
+   * ``pyt_vllm_jais-30b``
+
+   * ``pyt_vllm_llama-3.1-8b_fp8``
+
+   * ``pyt_vllm_llama-3.1-70b_fp8``
+
+   * ``pyt_vllm_llama-3.1-405b_fp8``
+
+   * ``pyt_vllm_mixtral-8x7b_fp8``
+
+   * ``pyt_vllm_mixtral-8x22b_fp8``
+
+.. _vllm-benchmark-standalone:
+
+Standalone benchmarking
+=======================
+
+You can run the vLLM benchmark tool independently by starting the
+:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
+snippet.
+
+.. code-block::
+
+   docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+
+In the Docker container, clone the ROCm MAD repository and navigate to the
+benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+.. code-block::
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD/scripts/vllm
+
+Command
+-------
+
+To start the benchmark, use the following command with the appropriate options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
+
+See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+
+.. note::
+
+   The input sequence length, output sequence length, and tensor parallel (TP) are
+   already configured. You don't need to specify them with this script.
+
+.. note::
+
+   If you encounter the following error, pass your access-authorized Hugging
+   Face token to the gated models.
+
+   .. code-block:: shell
+
+      OSError: You are trying to access a gated repo.
+
+      # pass your HF_TOKEN
+      export HF_TOKEN=$your_personal_hf_token
+
+.. _vllm-benchmark-standalone-options:
+
+Options
+-------
+
+.. list-table::
+   :header-rows: 1
+   :align: center
+
+   * - Name
+     - Options
+     - Description
+
+   * - ``$test_option``
+     - latency
+     - Measure decoding token latency
+
+   * -
+     - throughput
+     - Measure token generation throughput
+
+   * -
+     - all
+     - Measure both throughput and latency
+
+   * - ``$model_repo``
+     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
+     - Llama 3.1 8B
+
+   * - (``float16``)
+     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
+     - Llama 3.1 70B
+
+   * -
+     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
+     - Llama 3.1 405B
+
+   * -
+     - ``meta-llama/Llama-2-7b-chat-hf``
+     - Llama 2 7B
+
+   * -
+     - ``meta-llama/Llama-2-70b-chat-hf``
+     - Llama 2 70B
+
+   * -
+     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
+     - Mixtral 8x7B
+
+   * -
+     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
+     - Mixtral 8x22B
+
+   * -
+     - ``mistralai/Mistral-7B-Instruct-v0.3``
+     - Mistral 7B
+
+   * -
+     - ``Qwen/Qwen2-7B-Instruct``
+     - Qwen2 7B
+
+   * -
+     - ``Qwen/Qwen2-72B-Instruct``
+     - Qwen2 72B
+
+   * -
+     - ``core42/jais-13b-chat``
+     - JAIS 13B
+
+   * -
+     - ``core42/jais-30b-chat-v3``
+     - JAIS 30B
+
+   * - ``$model_repo``
+     - ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
+     - Llama 3.1 8B
+
+   * - (``float8``)
+     - ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
+     - Llama 3.1 70B
+
+   * -
+     - ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
+     - Llama 3.1 405B
+
+   * -
+     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
+     - Mixtral 8x7B
+
+   * -
+     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
+     - Mixtral 8x22B
+
+   * - ``$num_gpu``
+     - 1 or 8
+     - Number of GPUs
+
+   * - ``$datatype``
+     - ``float16`` or ``float8``
+     - Data type
+
+.. _vllm-benchmark-run-benchmark:
+
+Running the benchmark on the MI300X accelerator
+-----------------------------------------------
+
+Here are some examples of running the benchmark with various options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+Example 1: latency benchmark
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ 
+Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
+
+.. code-block::
+
+   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
+   ./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
+
+Find the latency reports at:
+
+- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
+
+- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``
+
+Example 2: throughput benchmark
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
+   ./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
+
+Find the throughput reports at:
+
+- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
+
+- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``
+
+.. raw:: html
+
+   <style>
+   mjx-container[jax="CHTML"][display="true"] {
+       text-align: left;
+       margin: 0;
+   }
+   </style>
+
+.. note::
+
+   Throughput is calculated as:
+
+   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+
+Further reading
+===============
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
@@ -0,0 +1,461 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+***********************************************************
+LLM inference performance validation on AMD Instinct MI300X
+***********************************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
+a prebuilt, optimized environment for validating large language model (LLM)
+inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
+Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
+accelerator and includes the following components:
+
+* `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_
+
+* `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_
+
+* `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_
+
+With this Docker image, you can quickly validate the expected inference
+performance numbers for the MI300X accelerator. This topic also provides tips on
+optimizing performance with popular AI models. For more information, see the lists of
+:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
+and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.
+
+.. _vllm-benchmark-vllm:
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+Getting started
+===============
+
+Use the following procedures to reproduce the benchmark results on an
+MI300X accelerator with the prebuilt vLLM Docker image.
+
+.. _vllm-benchmark-get-started:
+
+1. Disable NUMA auto-balancing.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
+
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
+
+Once the setup is complete, choose between two options to reproduce the
+benchmark results:
+
+-  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+
+-  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+
+.. _vllm-benchmark-mad:
+
+MAD-integrated benchmarking
+===========================
+
+Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+directory and install the required packages on the host machine.
+
+.. code-block:: shell
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD
+   pip install -r requirements.txt
+
+Use this command to run a performance benchmark test of the Llama 3.1 8B model
+on one GPU with the ``float16`` data type on the host machine.
+
+.. code-block:: shell
+
+   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
+
+ROCm MAD launches a Docker container with the name
+``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
+model are collected in the following path: ``~/MAD/reports_float16/``.
+
+Although the following models are preconfigured to collect latency and
+throughput performance data, you can also change the benchmarking parameters.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+
+.. _vllm-benchmark-mad-models:
+
+Available models
+----------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 2, 3
+
+   * - Model name
+     - Tag
+
+   * - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
+     - ``pyt_vllm_llama-3.1-8b``
+
+   * - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+     - ``pyt_vllm_llama-3.1-70b``
+
+   * - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
+     - ``pyt_vllm_llama-3.1-405b``
+
+   * - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
+     - ``pyt_vllm_llama-3.2-11b-vision-instruct``
+
+   * - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
+     - ``pyt_vllm_llama-2-7b``
+
+   * - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
+     - ``pyt_vllm_llama-2-70b``
+
+   * - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
+     - ``pyt_vllm_mixtral-8x7b``
+
+   * - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
+     - ``pyt_vllm_mixtral-8x22b``
+
+   * - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
+     - ``pyt_vllm_mistral-7b``
+
+   * - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
+     - ``pyt_vllm_qwen2-7b``
+
+   * - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
+     - ``pyt_vllm_qwen2-72b``
+
+   * - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
+     - ``pyt_vllm_jais-13b``
+
+   * - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
+     - ``pyt_vllm_jais-30b``
+
+   * - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
+     - ``pyt_vllm_dbrx-instruct``
+
+   * - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
+     - ``pyt_vllm_gemma-2-27b``
+
+   * - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
+     - ``pyt_vllm_c4ai-command-r-plus-08-2024``
+
+   * - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
+     - ``pyt_vllm_deepseek-moe-16b-chat``
+
+   * - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
+     - ``pyt_vllm_llama-3.1-70b_fp8``
+
+   * - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
+     - ``pyt_vllm_llama-3.1-405b_fp8``
+
+   * - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
+     - ``pyt_vllm_mixtral-8x7b_fp8``
+
+   * - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
+     - ``pyt_vllm_mixtral-8x22b_fp8``
+
+   * - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
+     - ``pyt_vllm_mistral-7b_fp8``
+
+   * - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
+     - ``pyt_vllm_dbrx_fp8``
+
+   * - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
+     - ``pyt_vllm_command-r-plus_fp8``
+
+.. _vllm-benchmark-standalone:
+
+Standalone benchmarking
+=======================
+
+You can run the vLLM benchmark tool independently by starting the
+:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
+snippet.
+
+.. code-block::
+
+   docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
+   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
+
+In the Docker container, clone the ROCm MAD repository and navigate to the
+benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+.. code-block::
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD/scripts/vllm
+
+Command
+-------
+
+To start the benchmark, use the following command with the appropriate options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
+
+See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+
+.. note::
+
+   The input sequence length, output sequence length, and tensor parallel (TP) are
+   already configured. You don't need to specify them with this script.
+
+.. note::
+
+   If you encounter the following error, pass your access-authorized Hugging
+   Face token to the gated models.
+
+   .. code-block:: shell
+
+      OSError: You are trying to access a gated repo.
+
+      # pass your HF_TOKEN
+      export HF_TOKEN=$your_personal_hf_token
+
+.. _vllm-benchmark-standalone-options:
+
+Options and available models
+----------------------------
+
+.. list-table::
+   :header-rows: 1
+   :align: center
+
+   * - Name
+     - Options
+     - Description
+
+   * - ``$test_option``
+     - latency
+     - Measure decoding token latency
+
+   * -
+     - throughput
+     - Measure token generation throughput
+
+   * -
+     - all
+     - Measure both throughput and latency
+
+   * - ``$model_repo``
+     - ``meta-llama/Llama-3.1-8B-Instruct``
+     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
+
+   * - (``float16``)
+     - ``meta-llama/Llama-3.1-70B-Instruct``
+     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+   * -
+     - ``meta-llama/Llama-3.1-405B-Instruct``
+     - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
+
+   * -
+     - ``meta-llama/Llama-3.2-11B-Vision-Instruct``
+     - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
+
+   * -
+     - ``meta-llama/Llama-2-7b-chat-hf``
+     - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
+
+   * -
+     - ``meta-llama/Llama-2-70b-chat-hf``
+     - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
+
+   * -
+     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
+     - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
+
+   * -
+     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
+     - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
+
+   * -
+     - ``mistralai/Mistral-7B-Instruct-v0.3``
+     - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
+
+   * -
+     - ``Qwen/Qwen2-7B-Instruct``
+     - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
+
+   * -
+     - ``Qwen/Qwen2-72B-Instruct``
+     - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
+
+   * -
+     - ``core42/jais-13b-chat``
+     - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
+
+   * -
+     - ``core42/jais-30b-chat-v3``
+     - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
+
+   * -
+     - ``databricks/dbrx-instruct``
+     - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
+
+   * -
+     - ``google/gemma-2-27b``
+     - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
+
+   * -
+     - ``CohereForAI/c4ai-command-r-plus-08-2024``
+     - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
+
+   * -
+     - ``deepseek-ai/deepseek-moe-16b-chat``
+     - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
+
+   * - ``$model_repo``
+     - ``amd/Llama-3.1-70B-Instruct-FP8-KV``
+     - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
+
+   * - (``float8``)
+     - ``amd/Llama-3.1-405B-Instruct-FP8-KV``
+     - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
+
+   * -
+     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
+     - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
+
+   * -
+     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
+     - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
+
+   * -
+     - ``amd/Mistral-7B-v0.1-FP8-KV``
+     - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
+
+   * -
+     - ``amd/dbrx-instruct-FP8-KV``
+     - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
+
+   * -
+     - ``amd/c4ai-command-r-plus-FP8-KV``
+     - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
+
+   * - ``$num_gpu``
+     - 1 or 8
+     - Number of GPUs
+
+   * - ``$datatype``
+     - ``float16`` or ``float8``
+     - Data type
+
+.. _vllm-benchmark-run-benchmark:
+
+Running the benchmark on the MI300X accelerator
+-----------------------------------------------
+
+Here are some examples of running the benchmark with various options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+Example 1: latency benchmark
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ 
+Use this command to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
+
+.. code-block::
+
+   ./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
+   ./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
+
+Find the latency reports at:
+
+- ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``
+
+- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``
+
+Example 2: throughput benchmark
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use this command to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
+   ./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
+
+Find the throughput reports at:
+
+- ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``
+
+- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
+
+.. raw:: html
+
+   <style>
+   mjx-container[jax="CHTML"][display="true"] {
+       text-align: left;
+       margin: 0;
+   }
+   </style>
+
+.. note::
+
+   Throughput is calculated as:
+
+   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+
+Further reading
+===============
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325.rst
@@ -0,0 +1,329 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
+
+   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
+
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
+
+   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
+
+   With this Docker image, you can quickly test the :ref:`expected
+   inference performance numbers <vllm-benchmark-performance-measurements>` for
+   MI300X series accelerators.
+
+   .. _vllm-benchmark-available-models:
+
+   Available models
+   ================
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model variant</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. _vllm-benchmark-vllm:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+   .. note::
+
+      vLLM is a toolkit and library for LLM inference and serving. AMD implements
+      high-performance custom kernels and modules in vLLM to enhance performance.
+      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+      more information.
+
+   .. _vllm-benchmark-performance-measurements:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   page provides reference throughput and latency measurements for inferencing
+   popular AI models.
+
+   .. important::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+   Advanced features and known issues
+   ==================================
+
+   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+   see the developer's guide at `<https://github.com/ROCm/vllm/tree/25070a1841df0dca585b7ddcb967c42aaec4b7c5/docs/dev-docker>`__.
+
+   Getting started
+   ===============
+
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X accelerator with the prebuilt vLLM Docker image.
+
+   .. _vllm-benchmark-get-started:
+
+   1. Disable NUMA auto-balancing.
+
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+      .. code-block:: shell
+
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0
+
+   2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+
+      Use the following command to pull the Docker image from Docker Hub.
+
+      .. code-block:: shell
+
+         docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt
+
+            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+            using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+         .. tab-item:: Standalone benchmarking
+
+            Run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ unified_docker.docker_hub_url }}>`_
+            as shown in the following snippet.
+
+            .. code-block::
+
+               docker pull {{ unified_docker.pull_tag }}
+               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
+
+            In the Docker container, clone the ROCm MAD repository and navigate to the
+            benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+            .. code-block::
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/vllm
+
+            To start the benchmark, use the following command with the appropriate options.
+
+            .. code-block::
+
+               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
+
+            .. list-table::
+               :header-rows: 1
+               :align: center
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$test_option``
+                 - latency
+                 - Measure decoding token latency
+
+               * -
+                 - throughput
+                 - Measure token generation throughput
+
+               * -
+                 - all
+                 - Measure both throughput and latency
+
+               * - ``$num_gpu``
+                 - 1 or 8
+                 - Number of GPUs
+
+               * - ``$datatype``
+                 - ``float16`` or ``float8``
+                 - Data type
+
+            .. note::
+
+               The input sequence length, output sequence length, and tensor parallel (TP) are
+               already configured. You don't need to specify them with this script.
+
+            .. note::
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block::
+
+                  OSError: You are trying to access a gated repo.
+
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            Here are some examples of running the benchmark with various options.
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
+
+              .. code-block::
+
+                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415.rst
@@ -0,0 +1,345 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
+
+   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
+
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
+
+   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
+
+   With this Docker image, you can quickly test the :ref:`expected
+   inference performance numbers <vllm-benchmark-performance-measurements>` for
+   MI300X series accelerators.
+
+   .. _vllm-benchmark-available-models:
+
+   Supported models
+   ================
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model variant</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. _vllm-benchmark-vllm:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+   .. note::
+
+      vLLM is a toolkit and library for LLM inference and serving. AMD implements
+      high-performance custom kernels and modules in vLLM to enhance performance.
+      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+      more information.
+
+   .. _vllm-benchmark-performance-measurements:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   page provides reference throughput and latency measurements for inferencing
+   popular AI models.
+
+   .. important::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+   Advanced features and known issues
+   ==================================
+
+   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+   see the developer's guide at `<https://github.com/ROCm/vllm/tree/7a9f58aae0e7215a5f3dccde60e35072c41656c2/docs/dev-docker>`__.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt
+
+            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+            using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
+               enable it, edit the default run behavior in the ``models.json``
+               configuration before running inference -- update the model's run
+               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            Run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ unified_docker.docker_hub_url }}>`_
+            as shown in the following snippet.
+
+            .. code-block::
+
+               docker pull {{ unified_docker.pull_tag }}
+               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
+
+            In the Docker container, clone the ROCm MAD repository and navigate to the
+            benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+            .. code-block::
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/vllm
+
+            To start the benchmark, use the following command with the appropriate options.
+
+            .. code-block::
+
+               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
+
+            .. list-table::
+               :header-rows: 1
+               :align: center
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$test_option``
+                 - latency
+                 - Measure decoding token latency
+
+               * -
+                 - throughput
+                 - Measure token generation throughput
+
+               * -
+                 - all
+                 - Measure both throughput and latency
+
+               * - ``$num_gpu``
+                 - 1 or 8
+                 - Number of GPUs
+
+               * - ``$datatype``
+                 - ``float16`` or ``float8``
+                 - Data type
+
+            .. note::
+
+               The input sequence length, output sequence length, and tensor parallel (TP) are
+               already configured. You don't need to specify them with this script.
+
+            .. note::
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block::
+
+                  OSError: You are trying to access a gated repo.
+
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            Here are some examples of running the benchmark with various options.
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block::
+
+                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
@@ -0,0 +1,354 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
+
+   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
+
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
+
+   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
+
+   With this Docker image, you can quickly test the :ref:`expected
+   inference performance numbers <vllm-benchmark-performance-measurements>` for
+   MI300X series accelerators.
+
+   .. _vllm-benchmark-available-models:
+
+   Supported models
+   ================
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model group</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. _vllm-benchmark-vllm:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+   .. note::
+
+      vLLM is a toolkit and library for LLM inference and serving. AMD implements
+      high-performance custom kernels and modules in vLLM to enhance performance.
+      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+      more information.
+
+   .. _vllm-benchmark-performance-measurements:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   page provides reference throughput and latency measurements for inferencing
+   popular AI models.
+
+   .. important::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+   Advanced features and known issues
+   ==================================
+
+   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+   see the developer's guide at `<https://github.com/ROCm/vllm/tree/16d2b92ebcf90fe55cf73fa0b9329a6c9d3dede8/docs/dev-docker>`__.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt
+
+            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+            using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
+               enable it, edit the default run behavior in the ``models.json``
+               configuration before running inference -- update the model's run
+               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            Run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ unified_docker.docker_hub_url }}>`_
+            as shown in the following snippet.
+
+            .. code-block::
+
+               docker pull {{ unified_docker.pull_tag }}
+               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
+
+            In the Docker container, clone the ROCm MAD repository and navigate to the
+            benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+            .. code-block::
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/vllm
+
+            To start the benchmark, use the following command with the appropriate options.
+
+            .. code-block::
+
+               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
+
+            .. list-table::
+               :header-rows: 1
+               :align: center
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$test_option``
+                 - latency
+                 - Measure decoding token latency
+
+               * -
+                 - throughput
+                 - Measure token generation throughput
+
+               * -
+                 - all
+                 - Measure both throughput and latency
+
+               * - ``$num_gpu``
+                 - 1 or 8
+                 - Number of GPUs
+
+               * - ``$datatype``
+                 - ``float16`` or ``float8``
+                 - Data type
+
+            .. note::
+
+               The input sequence length, output sequence length, and tensor parallel (TP) are
+               already configured. You don't need to specify them with this script.
+
+            .. note::
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block::
+
+                  OSError: You are trying to access a gated repo.
+
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            Here are some examples of running the benchmark with various options.
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block::
+
+                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
@@ -7,9 +9,14 @@
 vLLM inference performance testing
 **********************************

+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
 .. _vllm-benchmark-unified-docker:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -24,7 +31,7 @@ vLLM inference performance testing

   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_

-   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_

   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

@@ -37,11 +44,15 @@ vLLM inference performance testing
   Supported models
   ================

+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
@@ -50,7 +61,7 @@ vLLM inference performance testing
        </div>

        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model variant</div>
+          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -111,35 +122,37 @@ vLLM inference performance testing
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

-   Getting started
-   ===============
+   System validation
+   =================

-   Use the following procedures to reproduce the benchmark results on an
-   MI300X accelerator with the prebuilt vLLM Docker image.
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.

-   .. _vllm-benchmark-get-started:
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

-   1. Disable NUMA auto-balancing.
+   .. code-block:: shell

-      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-      might hang until the periodic balancing is finalized. For more information,
-      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0

-      .. code-block:: shell
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.

-         # disable automatic NUMA balancing
-         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-         # check if NUMA balancing is disabled (returns 0 if disabled)
-         cat /proc/sys/kernel/numa_balancing
-         0
+   Pull the Docker image
+   =====================

-   2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.

-      Use the following command to pull the Docker image from Docker Hub.
+   .. code-block:: shell

-      .. code-block:: shell
-
-         docker pull {{ unified_docker.pull_tag }}
+      docker pull {{ unified_docker.pull_tag }}

   Benchmarking
   ============
@@ -316,64 +329,27 @@ vLLM inference performance testing
 Further reading
 ===============

- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`../inference-optimization/workload`.
-
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- To learn how to run LLM models from Hugging Face or your own model, see
-  :doc:`Running models from Hugging Face <hugging-face-models>`.
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

- To learn how to optimize inference on LLMs, see
-  :doc:`Inference optimization <../inference-optimization/index>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

- To learn how to fine-tune LLMs, see
-  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================

-This table lists previous versions of the ROCm vLLM inference Docker image for
-inference performance testing. For detailed information about available models
-for benchmarking, see the version-specific documentation.
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.

-.. list-table::
-   :header-rows: 1
-   :stub-columns: 1
-
-   * - ROCm version
-     - vLLM version
-     - PyTorch version
-     - Resources
-
-   * - 6.3.1
-     - 0.7.3
-     - 2.7.0
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
-
-   * - 6.3.1
-     - 0.6.6
-     - 2.7.0
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
-
-   * - 6.2.1
-     - 0.6.4
-     - 2.5.0
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
-
-   * - 6.2.0
-     - 0.4.3
-     - 2.4.0
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605.rst
@@ -0,0 +1,353 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
+
+   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
+
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
+
+   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
+
+   With this Docker image, you can quickly test the :ref:`expected
+   inference performance numbers <vllm-benchmark-performance-measurements>` for
+   MI300X series accelerators.
+
+   .. _vllm-benchmark-available-models:
+
+   Supported models
+   ================
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model group</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. _vllm-benchmark-vllm:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+   .. note::
+
+      vLLM is a toolkit and library for LLM inference and serving. AMD implements
+      high-performance custom kernels and modules in vLLM to enhance performance.
+      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+      more information.
+
+   .. _vllm-benchmark-performance-measurements:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   page provides reference throughput and latency measurements for inferencing popular AI models.
+
+   .. important::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+      only reflects the latest version of this inference benchmarking environment.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+   Advanced features and known issues
+   ==================================
+
+   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+   see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt
+
+            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+            using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
+               enable it, edit the default run behavior in the ``models.json``
+               configuration before running inference -- update the model's run
+               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            Run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ unified_docker.docker_hub_url }}>`_
+            as shown in the following snippet.
+
+            .. code-block::
+
+               docker pull {{ unified_docker.pull_tag }}
+               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
+
+            In the Docker container, clone the ROCm MAD repository and navigate to the
+            benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+            .. code-block::
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/vllm
+
+            To start the benchmark, use the following command with the appropriate options.
+
+            .. code-block::
+
+               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
+
+            .. list-table::
+               :header-rows: 1
+               :align: center
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$test_option``
+                 - latency
+                 - Measure decoding token latency
+
+               * -
+                 - throughput
+                 - Measure token generation throughput
+
+               * -
+                 - all
+                 - Measure both throughput and latency
+
+               * - ``$num_gpu``
+                 - 1 or 8
+                 - Number of GPUs
+
+               * - ``$datatype``
+                 - ``float16`` or ``float8``
+                 - Data type
+
+            .. note::
+
+               The input sequence length, output sequence length, and tensor parallel (TP) are
+               already configured. You don't need to specify them with this script.
+
+            .. note::
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block::
+
+                  OSError: You are trying to access a gated repo.
+
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            Here are some examples of running the benchmark with various options.
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
@@ -0,0 +1,353 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
+
+   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
+
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
+
+   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
+
+   With this Docker image, you can quickly test the :ref:`expected
+   inference performance numbers <vllm-benchmark-performance-measurements-20250702>` for
+   MI300X series accelerators.
+
+   .. _vllm-benchmark-available-models-20250702:
+
+   Supported models
+   ================
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model group</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. _vllm-benchmark-vllm:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+   .. note::
+
+      vLLM is a toolkit and library for LLM inference and serving. AMD implements
+      high-performance custom kernels and modules in vLLM to enhance performance.
+      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+      more information.
+
+   .. _vllm-benchmark-performance-measurements-20250702:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   page provides reference throughput and latency measurements for inferencing popular AI models.
+
+   .. important::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+      only reflects the latest version of this inference benchmarking environment.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+   Advanced features and known issues
+   ==================================
+
+   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+   see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt
+
+            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+            using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models-20250702>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
+               enable it, edit the default run behavior in the ``models.json``
+               configuration before running inference -- update the model's run
+               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            Run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ unified_docker.docker_hub_url }}>`_
+            as shown in the following snippet.
+
+            .. code-block::
+
+               docker pull {{ unified_docker.pull_tag }}
+               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
+
+            In the Docker container, clone the ROCm MAD repository and navigate to the
+            benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+            .. code-block::
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/vllm
+
+            To start the benchmark, use the following command with the appropriate options.
+
+            .. code-block::
+
+               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
+
+            .. list-table::
+               :header-rows: 1
+               :align: center
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$test_option``
+                 - latency
+                 - Measure decoding token latency
+
+               * -
+                 - throughput
+                 - Measure token generation throughput
+
+               * -
+                 - all
+                 - Measure both throughput and latency
+
+               * - ``$num_gpu``
+                 - 1 or 8
+                 - Number of GPUs
+
+               * - ``$datatype``
+                 - ``float16`` or ``float8``
+                 - Data type
+
+            .. note::
+
+               The input sequence length, output sequence length, and tensor parallel (TP) are
+               already configured. You don't need to specify them with this script.
+
+            .. note::
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block::
+
+                  OSError: You are trying to access a gated repo.
+
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            Here are some examples of running the benchmark with various options.
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -0,0 +1,109 @@
+:orphan:
+
+**************************************************
+vLLM inference performance testing version history
+**************************************************
+
+This table lists previous versions of the ROCm vLLM inference Docker image for
+inference performance testing. For detailed information about available models
+for benchmarking, see the version-specific documentation. You can find tagged
+previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`_.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Docker image tag
+     - Components
+     - Resources
+
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
+       (latest)
+     - 
+       * ROCm 6.4.1
+       * vLLM 0.9.1
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <../vllm>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
+
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
+     - 
+       * ROCm 6.4.1
+       * vLLM 0.9.1
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.9.1-20250702>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`__
+
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
+     - 
+       * ROCm 6.4.1
+       * vLLM 0.9.0.1
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.9.0.1-20250605>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`__
+
+   * - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
+     - 
+       * ROCm 6.3.1
+       * vLLM 0.8.5 (0.8.6.dev)
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.8.5-20250521>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__
+
+   * - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
+     - 
+       * ROCm 6.3.1
+       * vLLM 0.8.5
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.8.5-20250513>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__
+
+   * - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
+     - 
+       * ROCm 6.3.1
+       * vLLM 0.8.3
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.8.3-20250415>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__
+
+   * - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
+     - 
+       * ROCm 6.3.1
+       * vLLM 0.7.3
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.7.3-20250325>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__
+
+   * - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
+     - 
+       * ROCm 6.3.1
+       * vLLM 0.6.6
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <vllm-0.6.6>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__
+
+   * - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
+     - 
+       * ROCm 6.2.1
+       * vLLM 0.6.4
+       * PyTorch 2.5.0
+     - 
+       * :doc:`Documentation <vllm-0.6.4>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__
+
+   * - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
+     - 
+       * ROCm 6.2.0
+       * vLLM 0.4.3
+       * PyTorch 2.4.0
+     - 
+       * :doc:`Documentation <vllm-0.4.3>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__
+
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -24,6 +24,10 @@ PyTorch inference performance testing
   Supported models
   ================

+   The following models are supported for inference performance benchmarking
+   with PyTorch and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
@@ -31,13 +35,13 @@ PyTorch inference performance testing
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>

        <div class="row mt-1" style="display: none;">
-          <div class="col-2 me-2 model-param-head">Model variant</div>
+          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -62,47 +66,52 @@ PyTorch inference performance testing
      {% endfor %}
   {% endfor %}

-   Getting started
-   ===============
+   System validation
+   =================

-   Use the following procedures to reproduce the benchmark results on an
-   MI300X series accelerator with the prebuilt PyTorch Docker image.
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.

-   .. _pytorch-benchmark-get-started:
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

-   1. Disable NUMA auto-balancing.
+   .. code-block:: shell

-      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-      might hang until the periodic balancing is finalized. For more information,
-      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0

-      .. code-block:: shell
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.

-         # disable automatic NUMA balancing
-         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-         # check if NUMA balancing is disabled (returns 0 if disabled)
-         cat /proc/sys/kernel/numa_balancing
-         0
+   Pull the Docker image
+   =====================

   .. container:: model-doc pyt_chai1_inference

-      2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
+      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.

-         .. code-block:: shell
+      .. code-block:: shell

-            docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
+         docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue

-         .. note::
+      .. note::

-            The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
+         The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.

-   .. container:: model-doc pyt_clip_inference
+   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference

-      2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
+      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.

-         .. code-block:: shell
+      .. code-block:: shell

-            docker pull rocm/pytorch:latest
+         docker pull rocm/pytorch:latest
+
+   .. _pytorch-benchmark-get-started:

   Benchmarking
   ============
@@ -131,7 +140,11 @@ PyTorch inference performance testing
      .. code-block:: shell

         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-         python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+         madengine run \
+             --tags {{model.mad_tag}} \
+             --keep-model-dir \
+             --live-output \
+             --timeout 28800

      MAD launches a Docker container with the name
      ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
@@ -142,8 +155,7 @@ PyTorch inference performance testing
         For improved performance, consider enabling TunableOp. By default,
         ``{{model.mad_tag}}`` runs with TunableOp disabled (see
         `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable
-         it, edit the default run behavior in the ``tools/run_models.py``-- update the model's
-         run ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+         it, include the ``--tunableop on`` argument in your run.

         Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
         Although this might increase the initial training time, it can result in a performance gain.
@@ -154,14 +166,19 @@ PyTorch inference performance testing
 Further reading
 ===============

+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
 - To learn more about system settings and management practices to configure your system for
-  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`../../inference-optimization/workload`.

 - To learn how to run LLM models from Hugging Face or your model, see
-  :doc:`Running models from Hugging Face <hugging-face-models>`.
+  :doc:`Running models from Hugging Face <../hugging-face-models>`.

 - To learn how to optimize inference on LLMs, see
-  :doc:`Inference optimization <../inference-optimization/index>`.
+  :doc:`Inference optimization <../../inference-optimization/index>`.

 - To learn how to fine-tune LLMs, see
-  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
+  :doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -0,0 +1,443 @@
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}
+
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements>` for
+MI300X series accelerators.
+
+What's new
+==========
+
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
+
+* The ``--compilation-config`` parameter is no longer required as its options are now enabled by default.
+  This parameter has been removed from the benchmarking script.
+
+* Resolved a Llama 3.1 405B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
+  This parameter has been removed from the benchmarking script.
+
+* Fixed a ``+rms_norm`` custom kernel issue.
+
+* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
+
+* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   .. _vllm-benchmark-available-models:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model group</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. _vllm-benchmark-vllm:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and latency measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before running inference.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
+               To enable it, include the ``--tunableop on`` argument in your
+               run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed
+               by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``$test_option``
+                       - latency
+                       - Measure decoding token latency
+
+                     * -
+                       - throughput
+                       - Measure token generation throughput
+
+                     * -
+                       - all
+                       - Measure both throughput and latency
+
+                     * - ``$num_gpu``
+                       - 1 or 8
+                       - Number of GPUs
+
+                     * - ``$datatype``
+                       - ``float16`` or ``float8``
+                       - Data type
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               Command:
+
+               .. code-block:: shell
+
+                  ./vllm_benchmark_report.sh \
+                      -s $test_option \
+                      -m {{model.model_repo}} \
+                      -g $num_gpu \
+                      -d {{model.precision}}
+
+               .. note::
+
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh \
+                     -s latency \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh \
+                     -s throughput \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm/vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.
+
+   .. code-block:: shell
+
+      cd vllm
+      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
+
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
+
+   .. code-block:: shell
+
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+
+Known issues and workarounds
+============================
+
+AITER does not support FP8 KV cache yet.
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`previous-versions/vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -14,14 +14,14 @@ Throughout the following topics, this section provides a comprehensive guide to
 The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
 training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.

- :doc:`Installing ROCm and machine learning frameworks <install>`
+- :doc:`Installing ROCm and machine learning frameworks <../install>`

 - :doc:`Running models from Hugging Face <hugging-face-models>`

 - :doc:`LLM inference frameworks <llm-inference-frameworks>`

- :doc:`vLLM inference performance testing <vllm-benchmark>`
+- :doc:`vLLM inference performance testing <benchmark-docker/vllm>`

- :doc:`PyTorch inference performance testing <pytorch-inference-benchmark>`
+- :doc:`PyTorch inference performance testing <benchmark-docker/pytorch-inference>`

 - :doc:`Deploying your model <deploy-your-model>`
--- a/docs/how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
+++ b/docs/how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
@@ -141,7 +141,7 @@ Installing vLLM

   ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
   on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
-   For more information, see :doc:`vllm-benchmark`.
+   For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

 .. _fine-tuning-llms-tgi:

--- a/docs/how-to/rocm-for-ai/inference/install.rst
+++ b/docs/how-to/rocm-for-ai/inference/install.rst
@@ -28,9 +28,9 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install

 * :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`

-* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
+* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`

-* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`.
+* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`

 .. grid:: 1

@@ -59,4 +59,8 @@ images with the framework pre-installed.

 * :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`

-The sections that follow in :doc:`Training a model <../training/train-a-model>` are geared for a ROCm with PyTorch installation.
+Next steps
+==========
+
+After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
+to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
--- a/docs/how-to/rocm-for-ai/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-health-check.rst
@@ -0,0 +1,104 @@
+.. meta::
+   :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
+   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference
+
+.. _rocm-for-ai-system-health-bench:
+
+************************
+System health benchmarks
+************************
+
+Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
+
+ROCm Validation Suite (RVS) tests
+=================================
+
+RVS provides a collection of tests, benchmarks, and qualification tools, each
+targeting a specific subsystem of the system under test. It includes tests for
+GPU stress and memory bandwidth.
+
+.. _healthcheck-install-rvs:
+
+Install ROCm Validation Suite
+-----------------------------
+
+To get started, install RVS. For example, on an Ubuntu system with ROCm already
+installed, run the following command:
+
+.. code-block:: shell
+
+   sudo apt update
+   sudo apt install rocm-validation-suite
+
+See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
+and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
+in the Instinct documentation for more detailed instructions.
+
+Benchmark, stress, and qualification tests
+------------------------------------------
+
+The GPU stress test runs various GEMM computations as workloads to stress the GPU FLOPS performance and check whether it
+meets the configured target GFLOPS.
+
+Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
+<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
+section of the Instinct documentation for usage instructions.
+
+BabelStream test
+----------------
+
+BabelStream is a synthetic GPU benchmark based on the STREAM benchmark for
+CPUs, measuring memory transfer rates to and from global device memory.
+BabelStream tests are included with the RVS package as part of the `BABEL module
+<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.
+
+For more information, see `Performance benchmarking
+<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
+in the Instinct documentation.
+
+RCCL tests
+==========
+
+The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
+communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
+the performance and verifies the correctness of these collective operations.
+This helps ensure optimal scaling for multi-accelerator tasks.
+
+1. To get started, build RCCL-tests using the official instructions in the README at
+   `<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
+   following commands:
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/rccl-tests.git
+      cd rccl-tests
+      make
+
+2. Run the suggested RCCL tests -- see `RCCL benchmarking
+   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
+   in the Instinct performance benchmarking documentation for instructions.
+
+TransferBench test
+==================
+
+TransferBench is a standalone utility for benchmarking simultaneous data
+transfer performance between various devices in the system, including
+CPU-to-GPU and GPU-to-GPU (peer-to-peer). This helps identify potential
+bottlenecks in data movement between the host system and the GPUs, or between
+GPUs, which can impact end-to-end latency.
+
+.. _healthcheck-install-transferbench:
+
+1. To get started, use the instructions in the `TransferBench documentation
+   <https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
+   or use the following commands:
+
+   .. code:: shell
+
+      git clone https://github.com/ROCm/TransferBench.git
+      cd TransferBench
+      CC=hipcc make
+
+2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
+   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
+   in the Instinct performance benchmarking documentation for instructions.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -79,11 +79,18 @@ across different input sequences. Support for packed input format is planned for
 System validation
 =================

-If you have already validated your system settings, including NUMA
-auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
-and optimization steps <train-a-model-system-validation>` to set up your system
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
 before starting training.

+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
 Environment setup
 =================

@@ -175,8 +182,8 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

 .. _amd-maxtext-download-docker:

-Download the Docker image
-------------------------
+Pull the Docker image
+---------------------

 1. Use the following command to pull the Docker image from Docker Hub.

@@ -367,22 +374,5 @@ own cluster setup.
 Previous versions
 =================

-This table lists previous versions of the ROCm JAX MaxText Docker image for training
-performance testing. For detailed information about available models for
-benchmarking, see the version-specific documentation.
-
-.. list-table::
-   :header-rows: 1
-   :stub-columns: 1
-
-   * - Image version
-     - ROCm version
-     - JAX version
-     - Resources
-
-   * - 25.4
-     - 6.3.0
-     - 0.4.31
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
+See :doc:`previous-versions/jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
@@ -34,11 +34,18 @@ for MPT-30B with access to detailed logs and performance metrics.
 System validation
 =================

-If you have already validated your system settings, including NUMA
-auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
-and optimization steps <train-a-model-system-validation>` to set up your system
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
 before starting training.

+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
 Getting started
 ===============

@@ -66,7 +73,11 @@ document are not validated.

      .. code-block:: shell

-         python3 tools/run_models.py --tags pyt_mpt30b_training --keep-model-dir --live-output --clean-docker-cache
+         madengine run \
+             --tags pyt_mpt30b_training \
+             --keep-model-dir \
+             --live-output \
+             --clean-docker-cache

      .. tip::

@@ -83,7 +94,7 @@ document are not validated.

         For improved performance (training throughput), consider enabling TunableOp.
         By default, ``pyt_mpt30b_training`` runs with TunableOp disabled. To enable it,
-         run ``tools/run_models.py`` with the ``--tunableop on`` argument or edit the
+         run ``madengine run`` with the ``--tunableop on`` argument or edit the
         ``models.json`` configuration before running training.

         Although this might increase the initial training time, it can result in a performance gain.
@@ -165,4 +176,13 @@ Key performance metrics include:

    Overall training loss. A decreasing trend indicates the model is learning effectively.

+Further reading
+===============

+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -0,0 +1,34 @@
+:orphan:
+
+********************************************************
+JAX MaxText training performance testing version history
+********************************************************
+
+This table lists previous versions of the ROCm JAX MaxText Docker image for training
+performance testing. For detailed information about available models for
+benchmarking, see the version-specific documentation.
+You can find tagged
+previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/jax-training/tags>`_.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Image version
+     - Components
+     - Resources
+
+   * - 25.5 (latest)
+     - 
+       * ROCm 6.3.4
+       * JAX 0.4.35
+     - 
+       * :doc:`Documentation <../jax-maxtext>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`_
+
+   * - 25.4
+     - 
+       * ROCm 6.3.0
+       * JAX 0.4.31
+     - 
+       * :doc:`Documentation <jax-maxtext-v25.4>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -0,0 +1,358 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using JAX MaxText for ROCm.
+   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with MaxText for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm JAX MaxText
+   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
+
+MaxText is a high-performance, open-source framework built on the Google JAX
+machine learning library to train LLMs at scale. The MaxText framework for
+ROCm is an optimized fork of the upstream
+`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
+on AMD MI300X series accelerators.
+
+The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.4``) image
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
+including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
+It includes the following software components:
+
++--------------------------+--------------------------------+
+| Software component       | Version                        |
++==========================+================================+
+| ROCm                     | 6.3.0                          |
++--------------------------+--------------------------------+
+| JAX                      | 0.4.31                         |
++--------------------------+--------------------------------+
+| Python                   | 3.10                           |
++--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+f81a3eb            |
++--------------------------+--------------------------------+
+| hipBLASLt                | git78ec8622                    |
++--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+MaxText provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- Flash Attention (FA) 3
+
+- GEMM tuning
+
+- Multi-node support
+
+.. _amd-maxtext-model-support:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+Unsupported features
+--------------------
+
+Currently, MaxText's default packed input format is not supported. Using this format
+with the current Docker image results in incorrect attention calculations
+across different input sequences. Support for packed input format is planned for a future release.
+
+System validation
+=================
+
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
+before starting training.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+as follows. Performance can vary for other training workloads, as AMD
+doesn’t validate configurations and run conditions outside those described.
+
+.. _amd-maxtext-multi-node-setup:
+
+Multi-node setup
+----------------
+
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+
+1. Install the following packages to build and install the RDMA driver.
+
+   .. code-block:: shell
+
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
+
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
+
+2. Set the following environment variables.
+
+   a. Master address
+
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-download-docker:
+
+Download the Docker image
+-------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/jax-training:maxtext-v25.4
+
+2. Run the Docker container.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4
+
+.. _amd-maxtext-get-started:
+
+Getting started
+===============
+
+The following examples demonstrate how to get started with single node
+and multi-node training using the benchmarking scripts provided at
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+
+.. important::
+
+   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+
+Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.
+
+Single node training benchmarking examples
+------------------------------------------
+
+* Example 1: Single node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_7b.sh
+
+* Example 2: Single node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_70b.sh
+
+* Example 3: Single node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_8b.sh
+
+* Example 4: Single node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_70b.sh
+
+* Example 5: Single node training with DeepSeek V2 16B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./deepseek_v2_16b.sh
+
+  .. note::
+
+     The TFLOP/s value reported by MaxText for DeepSeek is not accurate. Use
+     the tokens/s as a performance indicator.
+
+Multi-node training benchmarking examples
+-----------------------------------------
+
+The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
+own cluster setup.
+
+* Example 1: Multi-node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_7b_multinode.sh
+
+* Example 2: Multi-node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_70b_multinode.sh
+
+* Example 3: Multi-node training with Llama 3 8B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_8b_multinode.sh
+
+* Example 4: Multi-node training with Llama 3 70B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_70b_multinode.sh
+
+Previous versions
+=================
+
+See :doc:`jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -0,0 +1,59 @@
+:orphan:
+
+********************************************************
+Megatron-LM training performance testing version history
+********************************************************
+
+This table lists previous versions of the ROCm Megatron-LM Docker image for
+training performance testing. For detailed information about available models
+for benchmarking, see the version-specific documentation. You can find tagged
+previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/megatron-lm/tags>`_.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Image version
+     - Components
+     - Resources
+
+   * - v25.6 (latest)
+     - 
+       * ROCm 6.4.1
+       * PyTorch 2.8.0a0+git7d205b2
+     - 
+       * :doc:`Documentation <../megatron-lm>`
+       * `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
+       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
+
+   * - v25.5
+     - 
+       * ROCm 6.3.4
+       * PyTorch 2.8.0a0+gite2f9759
+     - 
+       * :doc:`Documentation <megatron-lm-v25.5>`
+       * `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`__
+       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py310/images/sha256-743fbf1ceff7a44c4452f938d783a7abf143737d1c15b2b95f6f8a62e0fd048b>`__
+
+   * - v25.4
+     - 
+       * ROCm 6.3.0
+       * PyTorch 2.7.0a0+git637433 
+     - 
+       * :doc:`Documentation <megatron-lm-v25.4>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_
+
+   * - v25.3
+     - 
+       * ROCm 6.3.0
+       * PyTorch 2.7.0a0+git637433 
+     - 
+       * :doc:`Documentation <megatron-lm-v25.3>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_
+
+   * - v24.12-dev
+     - 
+       * ROCm 6.1.0
+       * PyTorch 2.4.0
+     - 
+       * :doc:`Documentation <megatron-lm-v24.12-dev>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -0,0 +1,516 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using ROCm Megatron-LM
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+**************************************
+Training a model with ROCm Megatron-LM
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Megatron-LM
+   training performance documentation. See :doc:`../megatron-lm` for the latest version.
+
+.. _amd-megatron-lm:
+
+The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
+enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
+accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
+like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
+efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
+
+For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
+components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
+following software to accelerate training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.1                            |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.4.0                          |
+--------------------------+--------------------------------+
+| PyTorch Lightning        | 2.4.0                          |
+--------------------------+--------------------------------+
+| Megatron Core            | 0.9.0                          |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.5.0                          |
+--------------------------+--------------------------------+
+| Flash Attention          | v2.6                           |
+--------------------------+--------------------------------+
+| Transformers             | 4.44.0                         |
+--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+Megatron-LM provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- APEX
+
+- GEMM tuning
+
+- Torch.compile
+
+- 3D parallelism: TP + SP + CP
+
+- Distributed optimizer
+
+- Flash Attention (FA) 2
+
+- Fused kernels
+
+- Pre-training
+
+.. _amd-megatron-lm-model-support:
+
+The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+Prerequisite system validation steps
+====================================
+
+Complete the following system validation and optimization steps to set up your system before starting training.
+
+Disable NUMA auto-balancing
+---------------------------
+
+Generally, application performance can benefit from disabling NUMA auto-balancing. However,
+it might be detrimental to performance with certain types of workloads.
+
+Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
+Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
+the output is ``1``, run the following command to disable NUMA auto-balancing.
+
+.. code-block:: shell
+
+   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+
+See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
+for more information.
+
+Hardware verification with ROCm
+-------------------------------
+
+Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
+instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
+GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
+You can restore this setting to its default value with the ``rocm-smi -r`` command.
+
+Run the command:
+
+.. code-block:: shell
+
+   rocm-smi --setperfdeterminism 1900
+
+See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
+
+RCCL Bandwidth Test
+-------------------
+
+ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
+routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
+pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
+for efficient distributed training.
+
+Running the RCCL bandwidth test helps verify that:
+
+- The GPUs can communicate across nodes or within a single node.
+
+- The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
+  provides adequate bandwidth for communication.
+
+- No hardware setup or cabling issues could affect the communication between GPUs.
+
+Tuning and optimizing hyperparameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In distributed training, specific hyperparameters related to distributed communication can be tuned based on
+the results of the RCCL bandwidth test. These variables are already set in the Docker image:
+
+.. code-block:: shell
+
+   # force all RCCL streams to be high priority
+   export TORCH_NCCL_HIGH_PRIORITY=1
+
+   # specify which RDMA interfaces to use for communication
+   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+
+   # define the Global ID index used in RoCE mode
+   export NCCL_IB_GID_INDEX=3
+
+   # avoid data corruption/mismatch issue that existed in past releases
+   export RCCL_MSCCL_ENABLE=0
+
+Running the RCCL Bandwidth Test
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It's recommended you run the RCCL bandwidth test before launching training. It ensures system
+performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
+image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
+See :ref:`mi300x-rccl` for more information.
+
+Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
+
+.. code-block:: shell
+
+   ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
+
+.. image:: /data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
+   :width: 800
+
+Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
+recommended. So, a run on 8 GPUs looks something like:
+
+.. code-block:: shell
+
+   mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
+
+.. image:: /data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
+   :width: 800
+
+Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
+for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
+PyTorch and TensorFlow.
+
+Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
+
+.. code-block::
+
+   /home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
+   --mca pml ucx \
+   --mca btl ^openib \
+   -x NCCL_SOCKET_IFNAME=ens50f0np0 \
+   -x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
+   -x NCCL_IB_GID_INDEX=3 \
+   -x NCCL_MIN_NCHANNELS=40 \
+   -x NCCL_DEBUG=version \
+   $HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
+
+.. image:: /data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
+   :width: 800
+
+.. _mi300x-amd-megatron-lm-training:
+
+Start training on MI300X accelerators
+=====================================
+
+The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
+training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
+
+Use the following instructions to set up the environment, configure the script to train models, and
+reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
+image.
+
+.. _amd-megatron-lm-requirements:
+
+Download the Docker image and required packages
+-----------------------------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/megatron-lm:24.12-dev
+
+2. Launch the Docker container.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
+
+3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/Megatron-LM
+      cd Megatron-LM
+
+   .. note::
+
+      This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
+      Checking out this specific commit is recommended for a stable and reproducible environment.
+
+      .. code-block:: shell
+         
+         git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
+
+Prepare training datasets
+-------------------------
+
+If you already have the preprocessed data, you can skip this section.
+
+Use the following command to process datasets. We use GPT data as an example. You may change the merge table, use an
+end-of-document token, remove sentence splitting, and choose the tokenizer type.
+
+.. code-block:: shell
+
+   python tools/preprocess_data.py \
+       --input my-corpus.json \
+       --output-prefix my-gpt2 \
+       --vocab-file gpt2-vocab.json \
+       --tokenizer-type GPT2BPETokenizer \
+       --merge-file gpt2-merges.txt \
+       --append-eod
+
+In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
+``my-gpt2_text_document.idx``.
+
+.. image:: /data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
+   :width: 800
+
+.. _amd-megatron-lm-environment-setup:
+
+Environment setup
+-----------------
+
+In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70B, use the
+``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
+``train_llama3.sh`` and update the configuration script accordingly.
+
+Network interface
+^^^^^^^^^^^^^^^^^
+
+To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
+
+1. Run the following command to find the active network interface on your system.
+
+   .. code-block:: shell
+
+      ip a
+
+2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
+   example:
+
+   .. code-block:: shell
+
+      export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      export GLOO_SOCKET_IFNAME=ens50f0np0
+
+Dataset options
+^^^^^^^^^^^^^^^
+
+You can use either mock data or real data for training.
+
+* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
+
+  .. code-block:: shell
+
+     DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
+
+     DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
+
+  .. code-block:: shell
+
+     --data-path $DATA_PATH
+
+  Ensure that the files are accessible inside the Docker container.
+
+* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
+
+  .. code-block:: shell
+
+     --mock-data
+
+Tokenizer
+^^^^^^^^^
+
+Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
+models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
+a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
+fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
+handle a variety of input sequences, including unseen words or domain-specific terms.
+
+To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
+
+To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
+Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
+
+For example, if you're using the Llama 3.1 8B model:
+
+.. code-block:: shell
+
+   TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
+
+Run benchmark tests
+-------------------
+
+.. note::
+
+   If you're running **multi-node training**, update the following environment variables. They can
+   also be passed as command line arguments.
+
+   * Change ``localhost`` to the master node's hostname:
+
+     .. code-block:: shell
+
+        MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
+
+     .. code-block:: shell
+
+        NNODES="${NNODES:-1}"
+
+   * Set the rank of each node (0 for master, 1 for the first worker node, and so on):
+
+     .. code-block:: shell
+
+        NODE_RANK="${NODE_RANK:-0}"
+
+* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
+
+  .. code-block:: shell
+
+     {variables} bash examples/llama/train_llama2.sh
+
+* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
+
+  .. code-block:: shell
+
+     {variables} bash examples/llama/train_llama3.sh
+
+.. _amd-megatron-lm-benchmark-test-vars:
+
+The benchmark tests support the same set of variables:
+
+--------------------------+-----------------------+-----------------------+
+| Name                     | Options               | Description           |
+==========================+=======================+=======================+
+| ``TEE_OUTPUT``           | 0 or 1                | 0: disable training   |
+|                          |                       | log                   |
+|                          |                       |                       |
+|                          |                       | 1: enable training    |
+|                          |                       | log                   |
+--------------------------+-----------------------+-----------------------+
+| ``MBS``                  |                       | Micro batch size      |
+--------------------------+-----------------------+-----------------------+
+| ``BS``                   |                       | Batch size            |
+--------------------------+-----------------------+-----------------------+
+| ``TP``                   | 1, 2, 4, 8            | Tensor parallel       |
+--------------------------+-----------------------+-----------------------+
+| ``TE_FP8``               | 0 or 1                | Datatype.             |
+|                          |                       | If it is set to 1,    |
+|                          |                       | FP8.                  |
+|                          |                       |                       |
+|                          |                       | If it is set to 0,    |
+|                          |                       | BF16                  |
+--------------------------+-----------------------+-----------------------+
+| ``NO_TORCH_COMPILE``     | 0 or 1                | If it is set to 1,    |
+|                          |                       | enable torch.compile. |
+|                          |                       |                       |
+|                          |                       | If it is set to 0,    |
+|                          |                       | disable torch.compile |
+|                          |                       | (default)             |
+--------------------------+-----------------------+-----------------------+
+| ``SEQ_LENGTH``           |                       | Input sequence length |
+--------------------------+-----------------------+-----------------------+
+| ``GEMM_TUNING``          | 0 or 1                | If it is set to 1,    |
+|                          |                       | enable gemm tuning.   |
+|                          |                       |                       |
+|                          |                       | If it is set to 0,    |
+|                          |                       | disable gemm tuning   |
+--------------------------+-----------------------+-----------------------+
+| ``USE_FLASH_ATTN``       | 0 or 1                | 0: disable flash      |
+|                          |                       | attention             |
+|                          |                       |                       |
+|                          |                       | 1: enable flash       |
+|                          |                       | attention             |
+--------------------------+-----------------------+-----------------------+
+| ``ENABLE_PROFILING``     | 0 or 1                | 0: disable torch      |
+|                          |                       | profiling             |
+|                          |                       |                       |
+|                          |                       | 1: enable torch       |
+|                          |                       | profiling             |
+--------------------------+-----------------------+-----------------------+
+| ``MODEL_SIZE``           |                       | Size of the model:    |
+|                          |                       | 7B/70B, etc.          |
+--------------------------+-----------------------+-----------------------+
+| ``TOTAL_ITERS``          |                       | Total number of       |
+|                          |                       | iterations            |
+--------------------------+-----------------------+-----------------------+
+| ``transformer-impl``     | transformer_engine or | Enable transformer    |
+|                          | local                 | engine by default     |
+--------------------------+-----------------------+-----------------------+
+
+Benchmarking examples
+^^^^^^^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Single node training
+      :sync: single
+
+      Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
+      datatype, and so on.
+
+      .. code-block:: bash
+
+         TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
+
+      See the sample output:
+
+      .. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
+         :width: 800
+
+   .. tab-item:: Multi node training
+      :sync: multi
+
+      Launch the Docker container on each node.
+
+      In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
+      so on.
+
+      On the master node:
+
+      .. code-block:: bash
+
+         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+      On the worker node:
+
+      .. code-block:: bash
+
+         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
+
+      Sample output for 2-node training:
+
+      Master node:
+
+      .. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
+         :width: 800
+
+      Worker node:
+
+      .. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
+         :width: 800
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -0,0 +1,536 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+******************************************
+Training a model with Megatron-LM for ROCm
+******************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Megatron-LM
+   training performance documentation. See :doc:`../megatron-lm` for the latest version.
+
+The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
+designed to enable efficient training of large-scale language models on AMD
+GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
+enhanced scalability, performance, and resource utilization for AI workloads.
+It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
+DeepSeek, enabling developers to train next-generation AI models more
+efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
+
+AMD provides a ready-to-use Docker image for MI300X accelerators containing
+essential components, including PyTorch, ROCm libraries, and Megatron-LM
+utilities. It contains the following software components to accelerate training
+workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
+| Python                   | 3.10                           |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
+| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
+| Triton                   | 3.1                            |
+--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+Megatron-LM provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- APEX
+
+- GEMM tuning
+
+- Torch.compile
+
+- 3D parallelism: TP + SP + CP
+
+- Distributed optimizer
+
+- Flash Attention (FA) 3
+
+- Fused kernels
+
+- Pre-training
+
+.. _amd-megatron-lm-model-support:
+
+The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+System validation
+=================
+
+If you have already validated your system settings, skip this step. Otherwise,
+complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
+to set up your system before starting training.
+
+Disable NUMA auto-balancing
+---------------------------
+
+Generally, application performance can benefit from disabling NUMA auto-balancing. However,
+it might be detrimental to performance with certain types of workloads.
+
+Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
+Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
+the output is ``1``, run the following command to disable NUMA auto-balancing.
+
+.. code-block:: shell
+
+   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+
+See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
+for more information.
+
+.. _mi300x-amd-megatron-lm-training:
+
+Environment setup
+=================
+
+The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
+training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
+
+Use the following instructions to set up the environment, configure the script to train models, and
+reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
+image.
+
+.. _amd-megatron-lm-requirements:
+ 
+Download the Docker image
+-------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/megatron-lm:v25.3
+
+2. Launch the Docker container.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
+
+3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start megatron_training_env
+      docker exec -it megatron_training_env bash
+
+The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
+
+.. _amd-megatron-lm-environment-setup:
+
+Configuration scripts
+---------------------
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
+      script in the ``examples/llama`` directory of
+      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
+      Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
+      the configuration script accordingly.
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
+      directory of
+      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
+      and update the configuration script accordingly.
+
+Network interface
+^^^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      To avoid connectivity issues in multi-node deployments, ensure the correct network interface
+      is set in your training scripts.
+
+      1. Run the following command (outside the container) to find the active network interface on your system.
+
+         .. code-block:: shell
+
+            ip a
+
+      2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
+         example:
+
+         .. code-block:: shell
+
+            export NCCL_SOCKET_IFNAME=ens50f0np0
+
+            export GLOO_SOCKET_IFNAME=ens50f0np0
+
+Dataset options
+^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      You can use either mock data or real data for training.
+
+      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
+        value is ``1`` for enabled.
+
+        .. code-block:: bash
+
+           MOCK_DATA=1
+
+      * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
+
+        .. code-block:: bash
+
+           MOCK_DATA=0
+
+           DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"}  # Change to where your dataset is stored
+
+        Ensure that the files are accessible inside the Docker container.
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      If you don't already have the dataset, download the DeepSeek dataset using the following
+      commands:
+
+      .. code-block:: shell
+
+         mkdir deepseek-datasets
+         cd deepseek-datasets
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
+
+      You can use either mock data or real data for training.
+
+      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
+        value is ``1`` for enabled.
+
+        .. code-block:: bash
+
+           MOCK_DATA=1
+
+      * If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+        .. code-block:: bash
+
+           MOCK_DATA=0
+
+           DATA_DIR="/root/data/deepseek-datasets"  # Change to where your dataset is stored
+
+        Ensure that the files are accessible inside the Docker container.
+
+Tokenizer
+^^^^^^^^^
+
+Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
+models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
+a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
+fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
+handle a variety of input sequences, including unseen words or domain-specific terms.
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
+
+      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
+      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
+
+      For example, if you're using the Llama 3.1 8B model:
+
+      .. code-block:: shell
+
+         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+
+Multi-node training
+^^^^^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      If you're running multi-node training, update the following environment variables. They can
+      also be passed as command line arguments.
+
+      * Change ``localhost`` to the master node's hostname:
+
+        .. code-block:: shell
+
+           MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+      * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
+
+        .. code-block:: shell
+
+           NNODES="${NNODES:-1}"
+
+      * Set the rank of each node (0 for master, 1 for the first worker node, and so on):
+
+        .. code-block:: shell
+
+           NODE_RANK="${NODE_RANK:-0}"
+
+      * Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
+        NFS directory) for multi-node runs:
+
+        .. code-block:: shell
+
+           DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
+
+      * For multi-node runs, make sure the correct network drivers are installed on the nodes. If
+        inside a Docker container, either install the drivers inside the Docker container or pass the network
+        drivers from the host while creating the Docker container.
+
+Start training on AMD Instinct accelerators
+===========================================
+
+The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
+system performance, conduct training benchmarks, and achieve superior
+performance for models like Llama 3.1 and Llama 2. This container should not be
+expected to provide generalized performance across all training workloads. You
+can expect the container to perform in the model configurations described in
+the following section, but other configurations are not validated by AMD.
+
+Use the following instructions to set up the environment, configure the script
+to train models, and reproduce the benchmark results on MI300X series
+accelerators with the AMD Megatron-LM Docker image.
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      .. tab-set::
+
+         .. tab-item:: Single node training
+            :sync: single-node
+
+            To run training on a single node, navigate to the Megatron-LM folder and use the
+            following command:
+
+            .. code-block:: shell
+
+               TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
+
+         .. tab-item:: Multi-node training
+            :sync: multi-node
+
+            To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.
+
+            * On the master node ``NODE0``:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
+
+            * On the worker node ``NODE1``:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
+
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
+
+      .. code-block:: shell
+
+         cd /workspace/Megatron-LM
+         GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
+
+Key options
+-----------
+
+.. _amd-megatron-lm-benchmark-test-vars:
+
+The benchmark tests support the following sets of variables:
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      ``TEE_OUTPUT``
+        ``1`` to enable training logs or ``0`` to disable.
+
+      ``TE_FP8``
+        ``0`` for BF16 (default) or ``1`` for FP8 GEMMs.
+
+      ``GEMM_TUNING``
+        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
+
+      ``USE_FLASH_ATTN``
+        ``1`` to enable Flash Attention.
+
+      ``ENABLE_PROFILING``
+        ``1`` to enable PyTorch profiling for performance analysis.
+
+      ``transformer-impl``
+        ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
+
+      ``MODEL_SIZE``
+        ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
+
+      ``TOTAL_ITERS``
+        The total number of iterations -- ``10`` by default.
+
+      ``MOCK_DATA``
+        ``1`` to use mock data or ``0`` to use real data provided by you.
+
+      ``MBS``
+        Micro batch size.
+
+      ``BS``
+        Global batch size.
+
+      ``TP``
+        Tensor parallel (``1``, ``2``, ``4``, ``8``).
+
+      ``SEQ_LENGTH``
+        Input sequence length.
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      ``PR``
+        Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
+
+      ``GEMM_TUNING``
+        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
+
+      ``TOTAL_ITERS``
+        The total number of iterations -- ``10`` by default.
+
+      ``MOCK_DATA``
+        ``1`` to use mock data or ``0`` to use real data provided by you.
+
+      ``MBS``
+        Micro batch size.
+
+      ``GBS``
+        Global batch size.
+
+Benchmarking examples
+---------------------
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      .. tab-set::
+
+         .. tab-item:: Single node training
+            :sync: single-node
+
+            Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
+            datatype, and so on.
+
+            .. code-block:: bash
+
+               TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
+
+            See the sample output:
+
+            .. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
+               :width: 800
+
+         .. tab-item:: Multi-node training
+            :sync: multi-node
+
+            Launch the Docker container on each node.
+
+            In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
+            so on.
+
+            On the master node:
+
+            .. code-block:: bash
+
+               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+            On the worker node:
+
+            .. code-block:: bash
+
+               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
+
+            Sample output for 2-node training:
+
+            Master node:
+
+            .. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
+               :width: 800
+
+            Worker node:
+
+            .. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
+               :width: 800
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -0,0 +1,618 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+******************************************
+Training a model with Megatron-LM for ROCm
+******************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Megatron-LM
+   training performance documentation. See :doc:`../megatron-lm` for the latest version.
+
+The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
+designed to enable efficient training of large-scale language models on AMD
+GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
+enhanced scalability, performance, and resource utilization for AI workloads.
+It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
+DeepSeek, enabling developers to train next-generation AI models more
+efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
+
+AMD provides a ready-to-use Docker image for MI300X series accelerators containing
+essential components, including PyTorch, ROCm libraries, and Megatron-LM
+utilities. It contains the following software components to accelerate training
+workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
+| Python                   | 3.10                           |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
+| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
+| Triton                   | 3.1                            |
+--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+Megatron-LM provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- APEX
+
+- GEMM tuning
+
+- Torch.compile
+
+- 3D parallelism: TP + SP + CP
+
+- Distributed optimizer
+
+- Flash Attention (FA) 3
+
+- Fused kernels
+
+- Pre-training
+
+.. _amd-megatron-lm-model-support:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama, require an external license agreement through
+   a third party (for example, Meta).
+
+.. _amd-megatron-lm-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the :doc:`latest version of this training benchmarking environment <../megatron-lm>`.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
+before starting training.
+
+.. _mi300x-amd-megatron-lm-training:
+
+Environment setup
+=================
+
+The prebuilt ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
+training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
+
+Use the following instructions to set up the environment, configure the script to train models, and
+reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
+image.
+
+.. _amd-megatron-lm-requirements:
+ 
+Download the Docker image
+-------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/megatron-lm:v25.4
+
+2. Launch the Docker container.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.4
+
+3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start megatron_training_env
+      docker exec -it megatron_training_env bash
+
+The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
+(commit `fd6f0d1 <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).
+
+.. _amd-megatron-lm-environment-setup:
+
+Configuration scripts
+---------------------
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
+      script in the ``examples/llama`` directory of
+      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__.
+      Likewise, if you're working with Llama 3 or Llama 3.1, use ``train_llama3.sh`` and update
+      the configuration script accordingly.
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      Use the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
+      directory of
+      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__
+      and update the configuration script accordingly.
+
+Network interface
+^^^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface that has an IP address in the same subnet as
+      your other nodes. Then, update the following variables in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+         export GLOO_SOCKET_IFNAME=ens50f0np0
+
+Dataset options
+^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      You can use either mock data or real data for training.
+
+      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
+        value is ``1`` for enabled.
+
+        .. code-block:: bash
+
+           MOCK_DATA=1
+
+      * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
+
+        .. code-block:: bash
+
+           MOCK_DATA=0
+
+           DATA_PATH="/data/bookcorpus_text_sentence"  # Change to where your dataset is stored
+
+        Ensure that the files are accessible inside the Docker container.
+
+        To download the dataset, set the ``DATASET`` variable to the dataset you'd like to use. Two datasets are supported: ``DATASET=wiki`` and ``DATASET=bookcorpus``.
+        Use the following command to download the dataset.
+
+        .. code-block:: shell
+
+           DATASET=wiki bash examples/llama/prepare_dataset.sh # For wiki-en dataset
+           DATASET=bookcorpus bash examples/llama/prepare_dataset.sh # For bookcorpus dataset
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      If you don't already have the dataset, download the DeepSeek dataset using the following
+      commands:
+
+      .. code-block:: shell
+
+         mkdir deepseek-datasets
+         cd deepseek-datasets
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
+         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
+
+      You can use either mock data or real data for training.
+
+      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
+        value is ``1`` for enabled.
+
+        .. code-block:: bash
+
+           MOCK_DATA=1
+
+      * If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+        .. code-block:: bash
+
+           MOCK_DATA=0
+
+           DATA_DIR="/root/data/deepseek-datasets"  # Change to where your dataset is stored
+
+        Ensure that the files are accessible inside the Docker container.
+
+Tokenizer
+^^^^^^^^^
+
+Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
+models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
+a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
+fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
+handle a variety of input sequences, including unseen words or domain-specific terms.
+
+You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples.
+If the tokenizer is not found, it'll be downloaded to the default tokenizer model path: ``${DATA_DIR}/tokenizer_llama3``
+or ``${DATA_DIR}/tokenizer_llama2``.
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
+      or the default ``HuggingFaceTokenizer``.
+
+      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
+      Set the Hugging Face model path in the ``TOKENIZER_MODEL`` variable.
+
+      For example, if you're using the Llama 3.1 8B model:
+
+      .. code-block:: shell
+
+         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
+
+      .. note::
+
+         If you don't already have the Llama 3.1 tokenizer locally, set your
+         personal Hugging Face access token ``HF_TOKEN`` to download the
+         tokenizer. If you encounter the following error, set ``HF_TOKEN`` to
+         your access-authorized Hugging Face token.
+
+         .. code-block:: shell
+
+            OSError: You are trying to access a gated repo.
+
+            # pass your HF_TOKEN
+            export HF_TOKEN=$your_personal_hf_token
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+
+Multi-node training
+^^^^^^^^^^^^^^^^^^^
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      If you're running multi-node training, update the following environment variables. They can
+      also be passed as command line arguments.
+
+      * Change ``localhost`` to the master node's hostname:
+
+        .. code-block:: shell
+
+           MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+      * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
+
+        .. code-block:: shell
+
+           NNODES="${NNODES:-1}"
+
+      * Set the rank of each node (0 for master, 1 for the first worker node, and so on):
+
+        .. code-block:: shell
+
+           NODE_RANK="${NODE_RANK:-0}"
+
+      * Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
+        NFS directory) for multi-node runs:
+
+        .. code-block:: shell
+
+           DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
+
+      * For multi-node runs, make sure the correct network drivers are installed on the nodes. If
+        inside a Docker container, either install the drivers inside the Docker container or pass the network
+        drivers from the host while creating the Docker container.
+
+        .. code-block:: shell
+
+           # Specify which RDMA interfaces to use for communication
+           export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+
+Start training on AMD Instinct accelerators
+===========================================
+
+The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
+system performance, conduct training benchmarks, and achieve superior
+performance for models like Llama 3.1 and Llama 2. This container should not be
+expected to provide generalized performance across all training workloads. You
+can expect the container to perform in the model configurations described in
+the following section, but other configurations are not validated by AMD.
+
+Use the following instructions to set up the environment, configure the script
+to train models, and reproduce the benchmark results on MI300X series
+accelerators with the AMD Megatron-LM Docker image.
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      .. tab-set::
+
+         .. tab-item:: Single node training
+            :sync: single-node
+
+            To run training on a single node, navigate to the Megatron-LM folder and use one of the
+            following commands.
+
+            - For Llama 3.1 8B FP8:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
+
+            - For Llama 3.1 8B BF16:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
+
+            - For Llama 2 7B FP8:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
+
+            - For Llama 2 7B BF16:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
+
+            To run training with FSDP2 enabled, add the ``FSDP=1`` argument. For example:
+
+            - For Llama 3 70B BF16:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
+
+            - For Llama 2 70B BF16:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=3 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
+
+            .. note::
+
+               It's suggested to use ``TP=1`` when FSDP is enabled for higher throughput. FSDP2 is not supported with pipeline parallelism,
+               expert parallelism, MCore's distributed optimizer, gradient accumulation fusion, and ``FP16`` precision.
+
+         .. tab-item:: Multi-node training
+            :sync: multi-node
+
+            To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.
+
+            * On the master node ``NODE0``:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
+
+            * On the worker node ``NODE1``:
+
+              .. code-block:: shell
+
+                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
+
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
+
+      .. code-block:: shell
+
+         cd /workspace/Megatron-LM
+         GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
+
+Key options
+-----------
+
+.. _amd-megatron-lm-benchmark-test-vars:
+
+The benchmark tests support the following sets of variables:
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      ``TEE_OUTPUT``
+        ``1`` to enable training logs or ``0`` to disable.
+
+      ``TE_FP8``
+        ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.
+
+      ``GEMM_TUNING``
+        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
+
+      ``USE_FLASH_ATTN``
+        ``1`` to enable Flash Attention.
+
+      ``FSDP``
+        ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
+        ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.
+
+      ``ENABLE_PROFILING``
+        ``1`` to enable PyTorch profiling for performance analysis.
+
+      ``transformer-impl``
+        ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
+
+      ``MODEL_SIZE``
+        ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
+
+      ``TOTAL_ITERS``
+        The total number of iterations -- ``10`` by default.
+
+      ``MOCK_DATA``
+        ``1`` to use mock data or ``0`` to use real data you provide.
+
+      ``MBS``
+        Micro batch size.
+
+      ``BS``
+        Global batch size.
+
+      ``TP``
+        Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.
+
+      ``SEQ_LENGTH``
+        Input sequence length.
+
+   .. tab-item:: DeepSeek V2
+      :sync: deepseek
+
+      ``PR``
+        Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
+
+      ``GEMM_TUNING``
+        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
+
+      ``TRAIN_ITERS``
+        The total number of iterations.
+
+      ``MOCK_DATA``
+        ``1`` to use mock data or ``0`` to use real data you provide.
+
+      ``MBS``
+        Micro batch size.
+
+      ``GBS``
+        Global batch size.
+
+      ``SEQ_LEN``
+        Input sequence length.
+
+      ``AC``
+        Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
+
+Benchmarking examples
+---------------------
+
+.. tab-set::
+
+   .. tab-item:: Llama
+      :sync: llama
+
+      .. tab-set::
+
+         .. tab-item:: Single node training
+            :sync: single-node
+
+            Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
+            datatype, and so on.
+
+            .. code-block:: bash
+
+               TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
+
+            See the sample output:
+
+            .. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
+               :width: 800
+
+         .. tab-item:: Multi-node training
+            :sync: multi-node
+
+            Launch the Docker container on each node.
+
+            In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
+            so on.
+
+            On the master node:
+
+            .. code-block:: bash
+
+               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+            On the worker node:
+
+            .. code-block:: bash
+
+               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
+               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
+
+            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
+
+            Sample output for 2-node training:
+
+            Master node:
+
+            .. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
+               :width: 800
+
+            Worker node:
+
+            .. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
+               :width: 800
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5.rst
@@ -0,0 +1,775 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+******************************************
+Training a model with Megatron-LM for ROCm
+******************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Megatron-LM
+   training performance documentation. See :doc:`../megatron-lm` for the latest version.
+
+The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
+a specialized fork of the robust Megatron-LM, designed to enable efficient
+training of large-scale language models on AMD GPUs. By leveraging AMD
+Instinct™ MI300X series accelerators, Megatron-LM delivers enhanced
+scalability, performance, and resource utilization for AI workloads. It is
+purpose-built to support models like Llama, DeepSeek, and Mixtral,
+enabling developers to train next-generation AI models more
+efficiently.
+
+AMD provides a ready-to-use Docker image for MI300X series accelerators containing
+essential components, including PyTorch, ROCm libraries, and Megatron-LM
+utilities. It contains the following software components to accelerate training
+workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.8.0a0+gite2f9759             |
+--------------------------+--------------------------------+
+| Python                   | 3.12 or 3.10                   |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.13.0+bb061ade                |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.13.0-4f18bf6                 |
+--------------------------+--------------------------------+
+| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
+| RCCL                     | 2.22.3                         |
+--------------------------+--------------------------------+
+
+Megatron-LM provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- APEX
+
+- GEMM tuning
+
+- Torch.compile
+
+- 3D parallelism: TP + SP + CP
+
+- Distributed optimizer
+
+- Flash Attention (FA) 3
+
+- Fused kernels
+
+- Pre-training
+
+.. _amd-megatron-lm-model-support-v255:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml
+
+   Supported models
+   ================
+
+   The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
+   Some instructions, commands, and training recommendations in this documentation might
+   vary by model -- select one to get started.
+
+   {% set model_groups = data["megatron-lm_benchmark"].model_groups %}
+
+   .. raw:: html
+
+         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+           <div class="row">
+             <div class="col-2 me-2 model-param-head">Model</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+               <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+             </div>
+           </div>
+
+           <div class="row mt-1">
+             <div class="col-2 me-2 model-param-head">Model variant</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+             </div>
+           </div>
+         </div>
+
+.. note::
+
+   Some models, such as Llama, require an external license agreement through
+   a third party (for example, Meta).
+
+.. _amd-megatron-lm-performance-measurements-v255:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`__
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
+   only reflects the latest version of this training benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. _mi300x-amd-megatron-lm-training-v255:
+
+Environment setup
+=================
+
+Use the following instructions to set up the environment, configure the script to train models, and
+reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
+image.
+
+.. _amd-megatron-lm-requirements-v255:
+ 
+Download the Docker image
+-------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. tab-set:: 
+
+      .. tab-item:: Ubuntu 24.04 + Python 3.12
+         :sync: py312
+
+         .. code-block:: shell
+
+            docker pull rocm/megatron-lm:v25.5_py312
+
+      .. tab-item:: Ubuntu 22.04 + Python 3.10
+         :sync: py310
+
+         .. code-block:: shell
+
+            docker pull rocm/megatron-lm:v25.5_py310
+
+2. Launch the Docker container.
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04 + Python 3.12
+         :sync: py312
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312
+
+
+      .. tab-item:: Ubuntu 22.04 + Python 3.10
+         :sync: py310
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310
+
+3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start megatron_training_env
+      docker exec -it megatron_training_env bash
+
+The Docker container includes a pre-installed, verified version of the ROCm
+Megatron-LM development branch
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
+training scripts.
+
+.. _amd-megatron-lm-environment-setup-v255:
+
+Configuration
+=============
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b
+
+   Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
+
+   Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
+
+.. note::
+
+   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v255>` for more information on configuration options.
+
+Network interface
+-----------------
+
+Update the network interface in the script to match your system's network interface. To
+find your network interface, run the following (outside of any Docker container):
+
+.. code-block:: bash
+
+   ip a
+
+Look for an active interface that has an IP address in the same subnet as
+your other nodes. Then, update the following variables in the script, for
+example:
+
+.. code-block:: bash
+
+   export NCCL_SOCKET_IFNAME=ens50f0np0
+
+   export GLOO_SOCKET_IFNAME=ens50f0np0
+
+.. _amd-megatron-lm-tokenizer-v255:
+
+Tokenizer
+---------
+
+You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples.
+If the tokenizer is not found, it'll be downloaded if publicly available.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
+
+   If you do not have the Llama 3.3 tokenizer locally, you need to use your
+   personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer.
+   See `Llama-3.3-70B-Instruct
+   <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_. After you are
+   authorized, use your ``HF_TOKEN`` to download the tokenizer and set the
+   variable ``TOKENIZER_MODEL`` to the tokenizer path.
+
+   .. code-block:: shell
+
+      export HF_TOKEN=<Your personal Hugging Face access token>
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct"
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="meta-llama/Llama-3.1-8B"
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="meta-llama/Llama-3.1-70B"
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
+
+   The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3"
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite"
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   Download the Mixtral tokenizer.
+
+   .. code-block:: shell
+
+      mkdir tokenizer
+      cd tokenizer
+      export HF_TOKEN=<Your personal Hugging Face access token>
+      wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model
+
+   Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL=tokenizer/tokenizer.model
+
+Dataset options
+---------------
+
+You can use either mock data or real data for training.
+
+* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
+  value is ``1`` for enabled.
+
+  .. code-block:: bash
+
+     MOCK_DATA=1
+
+* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
+
+  .. code-block:: bash
+
+     MOCK_DATA=0
+
+     DATA_PATH="/data/bookcorpus_text_sentence"  # Change to where your dataset is stored
+
+  Ensure that the files are accessible inside the Docker container.
+
+Download the dataset
+^^^^^^^^^^^^^^^^^^^^
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
+
+   For Llama models, use the `prepare_dataset.sh
+   <https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
+   to prepare your dataset.
+   To download the dataset, set the ``DATASET`` variable to the dataset you'd
+   like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and
+   ``DATASET=bookcorpus``.
+
+   .. code-block:: shell
+
+      DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset
+      DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset
+
+   ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
+   Remember to either pre-download the tokenizer or set up Hugging Face access
+   when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v255>` section.
+
+   .. note::
+
+      When training, set ``DATA_PATH`` to the file name prefix shared by the ``.bin`` and ``.idx``
+      files, as in the following example:
+
+      .. code-block:: shell
+
+         DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   If you don't already have the dataset, download the DeepSeek dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir deepseek-datasets
+      cd deepseek-datasets
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/deepseek-datasets"  # Change to where your dataset is stored
+
+   Ensure that the files are accessible inside the Docker container.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   If you don't already have the dataset, download the DeepSeek dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir deepseek-datasets
+      cd deepseek-datasets
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/deepseek-datasets"  # Change to where your dataset is stored
+
+   Ensure that the files are accessible inside the Docker container.
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   If you don't already have the dataset, download the Mixtral dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir mixtral-datasets
+      cd mixtral-datasets
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/mixtral-datasets"  # Change to where your dataset is stored
+
+   Ensure that the files are accessible inside the Docker container.
+
+Multi-node configuration
+------------------------
+
+If you're running multi-node training, update the following environment variables. They can
+also be passed as command line arguments. Refer to the following example configurations.
+
+* Change ``localhost`` to the master node's hostname:
+
+  .. code-block:: shell
+
+     MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
+
+  .. code-block:: shell
+
+     NNODES="${NNODES:-1}"
+
+* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
+
+  .. code-block:: shell
+
+     NODE_RANK="${NODE_RANK:-0}"
+
+* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
+  NFS directory) for multi-node runs:
+
+  .. code-block:: shell
+
+     DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
+
+* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
+  inside a Docker container, either install the drivers inside the Docker container or pass the network
+  drivers from the host while creating the Docker container.
+
+  .. code-block:: shell
+
+     # Specify which RDMA interfaces to use for communication
+     export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+
+Getting started
+===============
+
+The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
+system performance, conduct training benchmarks, and achieve superior
+performance for models like Llama, DeepSeek, and Mixtral. This container should not be
+expected to provide generalized performance across all training workloads. You
+can expect the container to perform in the model configurations described in
+the following section, but other configurations are not validated by AMD.
+
+.. _amd-megatron-lm-run-training-v255:
+
+Run training
+------------
+
+Use the following example commands to set up the environment, configure
+:ref:`key options <amd-megatron-lm-benchmark-test-vars-v255>`, and run training on
+MI300X series accelerators with the AMD Megatron-LM environment.
+
+Single node training
+^^^^^^^^^^^^^^^^^^^^
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
+
+   To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
+   For example, use the following command:
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh 
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+      Currently, FSDP is only compatible with BF16 precision.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
+
+   To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
+   following command.
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
+
+   For Llama 3.1 8B BF16, use the following command:
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
+
+   To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
+   For example, use the following command:
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+      Currently, FSDP is only compatible with BF16 precision.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-7b
+
+   To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the
+   following command.
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
+
+   For Llama 2 7B BF16, use the following command:
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-70b
+
+   To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
+   For example, use the following command:
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+      Currently, FSDP is only compatible with BF16 precision.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, 
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      FORCE_BANLANCE=true \
+      RUN_ENV=cluster \
+      MODEL_SIZE=671B \
+      TRAIN_ITERS=50 \
+      SEQ_LEN=4096 \
+      NUM_LAYERS=3 \
+      MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \
+      PR=bf16 \
+      TP=1 PP=1 ETP=1 EP=8 \
+      GEMM_TUNING=1 \
+      NVTE_CK_USES_BWD_V3=1 \
+      USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \
+      GPT_LAYER_IN_TE=true \
+      bash examples/deepseek_v3/train_deepseekv3.sh
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b
+
+   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
+
+Multi-node training
+^^^^^^^^^^^^^^^^^^^
+
+To run training on multiple nodes, launch the Docker container on each node.
+For example, for Llama 3 using a two node setup (``NODE0`` as the master node),
+use these commands.
+
+* On the master node ``NODE0``:
+
+  .. code-block:: shell
+
+     TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8  MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
+
+* On the worker node ``NODE1``:
+
+  .. code-block:: shell
+
+     TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8  MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
+
+Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is
+provided in
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to
+enable training at scale under a SLURM environment. For example, to run
+training on 16 nodes, try the following command:
+
+.. code-block:: shell
+
+   sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
+
+.. _amd-megatron-lm-benchmark-test-vars-v255:
+
+Key options
+-----------
+
+The benchmark tests support the following sets of variables.
+
+``TEE_OUTPUT``
+  ``1`` to enable training logs or ``0`` to disable.
+
+``TE_FP8``
+  ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.
+
+``GEMM_TUNING``
+  ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
+
+``USE_FLASH_ATTN``
+  ``1`` to enable Flash Attention.
+
+``FSDP``
+  ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
+  ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.
+
+``ENABLE_PROFILING``
+  ``1`` to enable PyTorch profiling for performance analysis.
+
+``transformer-impl``
+  ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
+
+``MODEL_SIZE``
+  ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example.
+
+``TOTAL_ITERS``
+  The total number of iterations -- ``10`` by default.
+
+``MOCK_DATA``
+  ``1`` to use mock data or ``0`` to use real data you provide.
+
+``MBS``
+  Micro batch size.
+
+``BS``
+  Global batch size.
+
+``TP`` / ``TP_SIZE``
+  Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.
+
+``EP`` / ``EP_SIZE``
+  Expert parallel for MoE models.
+
+``SEQ_LENGTH``
+  Input sequence length.
+
+``PR``
+  Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
+
+``AC``
+  Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
+
+``NUM_LAYERS``
+  Use reduced number of layers as a proxy model.
+
+``RECOMPUTE_NUM_LAYERS``
+  Number of layers used for checkpointing recompute.
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -0,0 +1,49 @@
+:orphan:
+
+****************************************************
+PyTorch training performance testing version history
+****************************************************
+
+This table lists previous versions of the ROCm PyTorch training Docker image for
+training performance testing. For detailed information about available models
+for benchmarking, see the version-specific documentation. You can find tagged
+previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/pytorch-training/tags>`_.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Image version
+     - Components
+     - Resources
+
+   * - v25.6
+     - 
+       * ROCm 6.3.4
+       * PyTorch 2.8.0a0+git7d205b2
+     - 
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
+
+   * - v25.5
+     - 
+       * ROCm 6.3.4
+       * PyTorch 2.7.0a0+git637433
+     - 
+       * :doc:`Documentation <pytorch-training-v25.5>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
+
+   * - v25.4
+     - 
+       * ROCm 6.3.0
+       * PyTorch 2.7.0a0+git637433
+     - 
+       * :doc:`Documentation <pytorch-training-v25.4>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_
+
+   * - v25.3
+     - 
+       * ROCm 6.3.0
+       * PyTorch 2.7.0a0+git637433
+     - 
+       * :doc:`Documentation <pytorch-training-v25.3>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3.rst
@@ -0,0 +1,353 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm PyTorch
+   training performance documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The PyTorch for ROCm training Docker (``rocm/pytorch-training:v25.3``) image
+provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following
+software components to accelerate training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
+| Python                   | 3.10                           |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
+| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
+| Triton                   | 3.1                            |
+--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* FLUX.1-dev
+
+.. note::
+
+   Only these models are supported in the following steps.
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+System validation
+=================
+
+If you have already validated your system settings, skip this step. Otherwise,
+complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
+to set up your system before starting training.
+
+Disable NUMA auto-balancing
+---------------------------
+
+Generally, application performance can benefit from disabling NUMA auto-balancing. However,
+it might be detrimental to performance with certain types of workloads.
+
+Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
+Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
+the output is ``1``, run the following command to disable NUMA auto-balancing.
+
+.. code-block:: shell
+
+   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+
+See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
+for more information.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD 
+doesn’t validate configurations and run conditions outside those described.
+
+Download the Docker image
+-------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/pytorch-training:v25.3
+
+2. Run the Docker container.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3
+
+3. Use these commands if you exit the ``training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start training_env
+      docker exec -it training_env bash
+
+4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/MAD
+      cd MAD/scripts/pytorch_train
+
+Prepare training datasets and dependencies
+------------------------------------------
+
+The following benchmarking examples may require downloading models and datasets
+from Hugging Face. To ensure successful access to gated repos, set your
+``HF_TOKEN``.
+
+Run the setup script to install libraries and datasets needed for benchmarking.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_setup.sh
+
+``pytorch_benchmark_setup.sh`` installs the following libraries:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Library
+     - Benchmark model
+     - Reference
+
+   * - ``accelerate``
+     - Llama 3.1 8B, FLUX
+     - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+   * - ``datasets``
+     - Llama 3.1 8B, 70B, FLUX
+     - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+   * - ``torchdata``
+     - Llama 3.1 70B
+     - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+   * - ``tomli``
+     - Llama 3.1 70B
+     - `Tomli <https://pypi.org/project/tomli/>`_
+
+   * - ``tiktoken``
+     - Llama 3.1 70B
+     - `tiktoken <https://github.com/openai/tiktoken>`_
+
+   * - ``blobfile``
+     - Llama 3.1 70B
+     - `blobfile <https://pypi.org/project/blobfile/>`_
+
+   * - ``tabulate``
+     - Llama 3.1 70B
+     - `tabulate <https://pypi.org/project/tabulate/>`_
+
+   * - ``wandb``
+     - Llama 3.1 70B
+     - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+   * - ``sentencepiece``
+     - Llama 3.1 70B, FLUX
+     - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+   * - ``tensorboard``
+     - Llama 3.1 70B, FLUX
+     - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+   * - ``csvkit``
+     - FLUX
+     - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+   * - ``deepspeed``
+     - FLUX
+     - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+   * - ``diffusers``
+     - FLUX
+     - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+   * - ``GitPython``
+     - FLUX
+     - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+   * - ``opencv-python-headless``
+     - FLUX
+     - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+   * - ``peft``
+     - FLUX
+     - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+   * - ``protobuf``
+     - FLUX
+     - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+   * - ``pytest``
+     - FLUX
+     - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+   * - ``python-dotenv``
+     - FLUX
+     - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+   * - ``seaborn``
+     - FLUX
+     - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+   * - ``transformers``
+     - FLUX
+     - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
+
+* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+
+Along with the following datasets:
+
+* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
+
+* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+Start training on AMD Instinct accelerators
+===========================================
+
+The prebuilt PyTorch with ROCm training environment allows users to quickly validate
+system performance, conduct training benchmarks, and achieve superior
+performance for models like Llama 3.1 and Llama 2. This container should not be
+expected to provide generalized performance across all training workloads. You
+can expect the container to perform in the model configurations described in
+the following section, but other configurations are not validated by AMD.
+
+Use the following instructions to set up the environment, configure the script
+to train models, and reproduce the benchmark results on MI300X series
+accelerators with the AMD PyTorch training Docker image.
+
+Once your environment is set up, use the following commands and examples to start benchmarking.
+
+Pretraining
+-----------
+
+To start the pretraining benchmark, use the following command with the
+appropriate options. See the following list of options and their descriptions.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
+
+Options and available models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Options
+     - Description
+
+   * - ``$training_mode``
+     - ``pretrain``
+     - Benchmark pretraining
+
+   * -
+     - ``finetune_fw``
+     - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
+
+   * -
+     - ``finetune_lora``
+     - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
+
+   * - ``$datatype``
+     - FP8 or BF16
+     - Only Llama 3.1 8B supports FP8 precision.
+
+   * - ``$model_repo``
+     - Llama-3.1-8B
+     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
+
+   * - 
+     - Llama-3.1-70B
+     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+   * - 
+     - Flux
+     - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+
+Fine-tuning
+-----------
+
+To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
+with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
+
+Benchmarking examples
+---------------------
+
+Here are some examples of how to use the command.
+
+* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
+
+* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
+
+* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
+
+* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
+
+* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4.rst
@@ -0,0 +1,397 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm PyTorch
+   training performance documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The PyTorch for ROCm training Docker (``rocm/pytorch-training:v25.4``) image
+provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following
+software components to accelerate training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
+| Python                   | 3.10                           |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
+| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
+| Triton                   | 3.1                            |
+--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 2 70B
+
+* FLUX.1-dev
+
+.. note::
+
+   Only these models are supported in the following steps.
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
+before starting training.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD 
+doesn’t validate configurations and run conditions outside those described.
+
+Download the Docker image
+-------------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/pytorch-training:v25.4
+
+2. Run the Docker container.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.4
+
+3. Use these commands if you exit the ``training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start training_env
+      docker exec -it training_env bash
+
+4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+   repository and navigate to the benchmark scripts directory
+   ``/workspace/MAD/scripts/pytorch_train``.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/MAD
+      cd MAD/scripts/pytorch_train
+
+Prepare training datasets and dependencies
+------------------------------------------
+
+The following benchmarking examples require downloading models and datasets
+from Hugging Face. To ensure successful access to gated repos, set your
+``HF_TOKEN``.
+
+.. code-block:: shell
+
+   export HF_TOKEN=$your_personal_hugging_face_access_token
+
+Run the setup script to install libraries and datasets needed for benchmarking.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_setup.sh
+
+``pytorch_benchmark_setup.sh`` installs the following libraries:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Library
+     - Benchmark model
+     - Reference
+
+   * - ``accelerate``
+     - Llama 3.1 8B, FLUX
+     - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+   * - ``datasets``
+     - Llama 3.1 8B, 70B, FLUX
+     - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+   * - ``torchdata``
+     - Llama 3.1 70B
+     - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+   * - ``tomli``
+     - Llama 3.1 70B
+     - `Tomli <https://pypi.org/project/tomli/>`_
+
+   * - ``tiktoken``
+     - Llama 3.1 70B
+     - `tiktoken <https://github.com/openai/tiktoken>`_
+
+   * - ``blobfile``
+     - Llama 3.1 70B
+     - `blobfile <https://pypi.org/project/blobfile/>`_
+
+   * - ``tabulate``
+     - Llama 3.1 70B
+     - `tabulate <https://pypi.org/project/tabulate/>`_
+
+   * - ``wandb``
+     - Llama 3.1 70B
+     - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+   * - ``sentencepiece``
+     - Llama 3.1 70B, FLUX
+     - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+   * - ``tensorboard``
+     - Llama 3.1 70B, FLUX
+     - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+   * - ``csvkit``
+     - FLUX
+     - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+   * - ``deepspeed``
+     - FLUX
+     - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+   * - ``diffusers``
+     - FLUX
+     - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+   * - ``GitPython``
+     - FLUX
+     - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+   * - ``opencv-python-headless``
+     - FLUX
+     - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+   * - ``peft``
+     - FLUX
+     - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+   * - ``protobuf``
+     - FLUX
+     - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+   * - ``pytest``
+     - FLUX
+     - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+   * - ``python-dotenv``
+     - FLUX
+     - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+   * - ``seaborn``
+     - FLUX
+     - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+   * - ``transformers``
+     - FLUX
+     - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
+
+* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+
+Along with the following datasets:
+
+* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
+
+* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
+
+* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+Getting started
+===============
+
+The prebuilt PyTorch with ROCm training environment allows users to quickly validate
+system performance, conduct training benchmarks, and achieve superior
+performance for models like Llama 3.1 and Llama 2. This container should not be
+expected to provide generalized performance across all training workloads. You
+can expect the container to perform in the model configurations described in
+the following section, but other configurations are not validated by AMD.
+
+Use the following instructions to set up the environment, configure the script
+to train models, and reproduce the benchmark results on MI325X and MI300X
+accelerators with the AMD PyTorch training Docker image.
+
+Once your environment is set up, use the following commands and examples to start benchmarking.
+
+Pretraining
+-----------
+
+To start the pretraining benchmark, use the following command with the
+appropriate options. See the following list of options and their descriptions.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
+
+Options and available models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Options
+     - Description
+
+   * - ``$training_mode``
+     - ``pretrain``
+     - Benchmark pretraining
+
+   * -
+     - ``finetune_fw``
+     - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
+
+   * -
+     - ``finetune_lora``
+     - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
+
+   * -
+     - ``HF_finetune_lora``
+     - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
+
+   * - ``$datatype``
+     - ``FP8`` or ``BF16``
+     - Only Llama 3.1 8B supports FP8 precision.
+
+   * - ``$model_repo``
+     - ``Llama-3.1-8B``
+     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
+
+   * - 
+     - ``Llama-3.1-70B``
+     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+   * - 
+     - ``Llama-2-70B``
+     - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
+
+   * - 
+     - ``Flux``
+     - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+
+   * - ``$sequence_length``
+     - Sequence length for the language model.
+     - Between 2048 and 8192. 8192 by default.
+
+.. note::
+
+   Occasionally, downloading the Flux dataset might fail. In the event of this
+   error, manually download it from Hugging Face at
+   `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+   and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+   the required dataset.
+
+Fine-tuning
+-----------
+
+To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
+with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
+
+Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
+`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
+
+.. code-block:: shell
+
+   ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
+
+Benchmarking examples
+---------------------
+
+Here are some examples of how to use the command.
+
+* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
+
+* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
+
+* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
+
+* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
+
+* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
+
+* Example 6: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
+
+  .. code-block:: shell
+
+     ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -0,0 +1,439 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm PyTorch
+   training performance documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
+(``rocm/pytorch-training:v25.5``) image
+provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following
+software components to accelerate training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
+| Python                   | 3.10                           |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+25a33da            |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
+| hipBLASLt                | git53b53bf                     |
+--------------------------+--------------------------------+
+| Triton                   | 3.2.0                          |
+--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+* Llama 3.3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 2 70B
+
+* FLUX.1-dev
+
+.. note::
+
+   Only these models are supported in the following steps.
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD 
+doesn’t validate configurations and run conditions outside those described.
+
+Benchmarking
+============
+
+Once the setup is complete, choose between two options to start benchmarking:
+
+.. tab-set::
+
+   .. tab-item:: MAD-integrated benchmarking
+
+      Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+      directory and install the required packages on the host machine.
+
+      .. code-block:: shell
+
+         git clone https://github.com/ROCm/MAD
+         cd MAD
+         pip install -r requirements.txt
+
+      For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
+      using one GPU with the float16 data type on the host machine.
+
+      .. code-block:: shell
+
+         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+         python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
+
+      The available models for MAD-integrated benchmarking are:
+
+      * ``pyt_train_llama-3.3-70b``
+
+      * ``pyt_train_llama-3.1-8b``
+
+      * ``pyt_train_llama-3.1-70b``
+
+      * ``pyt_train_flux``
+
+      MAD launches a Docker container with the name
+      ``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
+      model are collected in the following path: ``~/MAD/perf.csv``.
+
+   .. tab-item:: Standalone benchmarking
+
+      .. rubric:: Download the Docker image and required packages
+
+      Use the following command to pull the Docker image from Docker Hub.
+
+      .. code-block:: shell
+
+         docker pull rocm/pytorch-training:v25.5
+
+      Run the Docker container.
+
+      .. code-block:: shell
+
+         docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
+
+      Use these commands if you exit the ``training_env`` container and need to return to it.
+
+      .. code-block:: shell
+
+         docker start training_env
+         docker exec -it training_env bash
+
+      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+      repository and navigate to the benchmark scripts directory
+      ``/workspace/MAD/scripts/pytorch_train``.
+
+      .. code-block:: shell
+
+         git clone https://github.com/ROCm/MAD
+         cd MAD/scripts/pytorch_train
+
+      .. rubric:: Prepare training datasets and dependencies
+
+      The following benchmarking examples require downloading models and datasets
+      from Hugging Face. To ensure successful access to gated repos, set your
+      ``HF_TOKEN``.
+
+      .. code-block:: shell
+
+         export HF_TOKEN=$your_personal_hugging_face_access_token
+
+      Run the setup script to install libraries and datasets needed for benchmarking.
+
+      .. code-block:: shell
+
+         ./pytorch_benchmark_setup.sh
+
+      ``pytorch_benchmark_setup.sh`` installs the following libraries:
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Library
+           - Benchmark model
+           - Reference
+
+         * - ``accelerate``
+           - Llama 3.1 8B, FLUX
+           - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+         * - ``datasets``
+           - Llama 3.1 8B, 70B, FLUX
+           - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+         * - ``torchdata``
+           - Llama 3.1 70B
+           - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+         * - ``tomli``
+           - Llama 3.1 70B
+           - `Tomli <https://pypi.org/project/tomli/>`_
+
+         * - ``tiktoken``
+           - Llama 3.1 70B
+           - `tiktoken <https://github.com/openai/tiktoken>`_
+
+         * - ``blobfile``
+           - Llama 3.1 70B
+           - `blobfile <https://pypi.org/project/blobfile/>`_
+
+         * - ``tabulate``
+           - Llama 3.1 70B
+           - `tabulate <https://pypi.org/project/tabulate/>`_
+
+         * - ``wandb``
+           - Llama 3.1 70B
+           - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+         * - ``sentencepiece``
+           - Llama 3.1 70B, FLUX
+           - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+         * - ``tensorboard``
+           - Llama 3.1 70B, FLUX
+           - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+         * - ``csvkit``
+           - FLUX
+           - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+         * - ``deepspeed``
+           - FLUX
+           - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+         * - ``diffusers``
+           - FLUX
+           - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+         * - ``GitPython``
+           - FLUX
+           - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+         * - ``opencv-python-headless``
+           - FLUX
+           - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+         * - ``peft``
+           - FLUX
+           - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+         * - ``protobuf``
+           - FLUX
+           - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+         * - ``pytest``
+           - FLUX
+           - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+         * - ``python-dotenv``
+           - FLUX
+           - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+         * - ``seaborn``
+           - FLUX
+           - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+         * - ``transformers``
+           - FLUX
+           - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+      ``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
+
+      * `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+      * `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+
+      Along with the following datasets:
+
+      * `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
+
+      * `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
+
+      * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+      .. rubric:: Pretraining
+
+      To start the pretraining benchmark, use the following command with the
+      appropriate options. See the following list of options and their descriptions.
+
+      .. code-block:: shell
+
+         ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Name
+           - Options
+           - Description
+
+         * - ``$training_mode``
+           - ``pretrain``
+           - Benchmark pretraining
+
+         * -
+           - ``finetune_fw``
+           - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
+
+         * -
+           - ``finetune_lora``
+           - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
+
+         * -
+           - ``HF_finetune_lora``
+           - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
+
+         * - ``$datatype``
+           - ``FP8`` or ``BF16``
+           - Only Llama 3.1 8B supports FP8 precision.
+
+         * - ``$model_repo``
+           - ``Llama-3.3-70B``
+           - `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
+
+         * - 
+           - ``Llama-3.1-8B``
+           - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
+
+         * - 
+           - ``Llama-3.1-70B``
+           - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+
+         * - 
+           - ``Llama-2-70B``
+           - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
+
+         * - 
+           - ``Flux``
+           - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+
+         * - ``$sequence_length``
+           - Sequence length for the language model.
+           - Between 2048 and 8192. 8192 by default.
+
+      .. note::
+
+         Occasionally, downloading the Flux dataset might fail. In the event of this
+         error, manually download it from Hugging Face at
+         `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+         and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+         the required dataset.
+
+      .. rubric:: Fine-tuning
+
+      To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
+      with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
+
+      .. code-block:: shell
+
+         ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
+
+      Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
+      `Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
+
+      .. code-block:: shell
+
+         ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
+
+      .. rubric:: Benchmarking examples
+
+      Here are some example commands to get started pretraining and fine-tuning with various model configurations.
+
+      * Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
+
+      * Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
+
+      * Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
+
+      * Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
+
+      * Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
+
+      * Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
+
+      * Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
+
+      * Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
+
+      * Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
+
+        .. code-block:: shell
+
+           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
+
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -9,28 +9,27 @@ Training a model with PyTorch for ROCm
 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.

-The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
-(``rocm/pytorch-training:v25.5``) image
-provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following
-software components to accelerate training workloads:
+The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+training workloads:

 +--------------------------+--------------------------------+
 | Software component       | Version                        |
 +==========================+================================+
 | ROCm                     | 6.3.4                          |
 +--------------------------+--------------------------------+
-| PyTorch                  | 2.7.0a0+git637433              |
+| PyTorch                  | 2.8.0a0+git7d205b2             |
 +--------------------------+--------------------------------+
-| Python                   | 3.10                           |
+| Python                   | 3.10.17                        |
 +--------------------------+--------------------------------+
-| Transformer Engine       | 1.12.0.dev0+25a33da            |
+| Transformer Engine       | 1.14.0+2f85f5f2                |
 +--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0                          |
+| Flash Attention          | 3.0.0.post1                    |
 +--------------------------+--------------------------------+
-| hipBLASLt                | git53b53bf                     |
+| hipBLASLt                | 0.15.0-8c6919d                 |
 +--------------------------+--------------------------------+
-| Triton                   | 3.2.0                          |
+| Triton                   | 3.3.0                          |
 +--------------------------+--------------------------------+

 .. _amd-pytorch-training-model-support:
@@ -40,415 +39,411 @@ Supported models

 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.

-* Llama 3.3 70B
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-* Llama 3.1 8B
+   {% set unified_docker = data.unified_docker.latest %}
+   {% set model_groups = data.model_groups %}

-* Llama 3.1 70B
+   .. raw:: html

-* Llama 2 70B
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>

-* FLUX.1-dev
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>

-.. note::
+   .. note::

-   Only these models are supported in the following steps.
+      Some models require an external license agreement through a third party (for example, Meta).

-   Some models, such as Llama 3, require an external license agreement through
-   a third party (for example, Meta).
+   .. _amd-pytorch-training-performance-measurements:

-.. _amd-pytorch-training-performance-measurements:
+   Performance measurements
+   ========================

-Performance measurements
-========================
-
-To evaluate performance, the
-`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-page provides reference throughput and latency measurements for training
-popular AI models.
-
-.. note::
-
-   The performance data presented in
+   To evaluate performance, the
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-   should not be interpreted as the peak performance achievable by AMD
-   Instinct MI325X and MI300X accelerators or ROCm software.
+   page provides reference throughput and latency measurements for training
+   popular AI models.

-System validation
-=================
+   .. note::

-If you have already validated your system settings, including NUMA
-auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
-and optimization steps <train-a-model-system-validation>` to set up your system
-before starting training.
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+      should not be interpreted as the peak performance achievable by AMD
+      Instinct MI325X and MI300X accelerators or ROCm software.

-This Docker image is optimized for specific model configurations outlined
-below. Performance can vary for other training workloads, as AMD 
-doesn’t validate configurations and run conditions outside those described.
+   System validation
+   =================

-Benchmarking
-============
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.

-Once the setup is complete, choose between two options to start benchmarking:
+   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+   before starting training.

-.. tab-set::
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.

-   .. tab-item:: MAD-integrated benchmarking
+   This Docker image is optimized for specific model configurations outlined
+   below. Performance can vary for other training workloads, as AMD
+   doesn’t validate configurations and run conditions outside those described.

-      Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-      directory and install the required packages on the host machine.
+   Benchmarking
+   ============

-      .. code-block:: shell
+   Once the setup is complete, choose between two options to start benchmarking:

-         git clone https://github.com/ROCm/MAD
-         cd MAD
-         pip install -r requirements.txt
+   .. tab-set::

-      For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
-      using one GPU with the float16 data type on the host machine.
+      .. tab-item:: MAD-integrated benchmarking

-      .. code-block:: shell
+         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         directory and install the required packages on the host machine.

-         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-         python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
+         .. code-block:: shell

-      The available models for MAD-integrated benchmarking are:
+            git clone https://github.com/ROCm/MAD
+            cd MAD
+            pip install -r requirements.txt

-      * ``pyt_train_llama-3.3-70b``
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}

-      * ``pyt_train_llama-3.1-8b``
+         .. container:: model-doc {{ model.mad_tag }}

-      * ``pyt_train_llama-3.1-70b``
+            For example, use this command to run the performance benchmark test on the {{ model.model }} model
+            using one GPU with the {{ model.precision }} data type on the host machine.

-      * ``pyt_train_flux``
+            .. code-block:: shell

-      MAD launches a Docker container with the name
-      ``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
-      model are collected in the following path: ``~/MAD/perf.csv``.
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800

-   .. tab-item:: Standalone benchmarking
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.

-      .. rubric:: Download the Docker image and required packages
+      {% endfor %}
+   {% endfor %}

-      Use the following command to pull the Docker image from Docker Hub.
+      .. tab-item:: Standalone benchmarking

-      .. code-block:: shell
+         .. rubric:: Download the Docker image and required packages

-         docker pull rocm/pytorch-training:v25.5
+         Use the following command to pull the Docker image from Docker Hub.

-      Run the Docker container.
+         .. code-block:: shell

-      .. code-block:: shell
+            docker pull {{ unified_docker.pull_tag }}

-         docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
+         Run the Docker container.

-      Use these commands if you exit the ``training_env`` container and need to return to it.
+         .. code-block:: shell

-      .. code-block:: shell
+            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}

-         docker start training_env
-         docker exec -it training_env bash
+         Use these commands if you exit the ``training_env`` container and need to return to it.

-      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
-      repository and navigate to the benchmark scripts directory
-      ``/workspace/MAD/scripts/pytorch_train``.
+         .. code-block:: shell

-      .. code-block:: shell
+            docker start training_env
+            docker exec -it training_env bash

-         git clone https://github.com/ROCm/MAD
-         cd MAD/scripts/pytorch_train
+         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         repository and navigate to the benchmark scripts directory
+         ``/workspace/MAD/scripts/pytorch_train``.

-      .. rubric:: Prepare training datasets and dependencies
+         .. code-block:: shell

-      The following benchmarking examples require downloading models and datasets
-      from Hugging Face. To ensure successful access to gated repos, set your
-      ``HF_TOKEN``.
+            git clone https://github.com/ROCm/MAD
+            cd MAD/scripts/pytorch_train

-      .. code-block:: shell
+         .. rubric:: Prepare training datasets and dependencies

-         export HF_TOKEN=$your_personal_hugging_face_access_token
+         The following benchmarking examples require downloading models and datasets
+         from Hugging Face. To ensure successful access to gated repos, set your
+         ``HF_TOKEN``.

-      Run the setup script to install libraries and datasets needed for benchmarking.
+         .. code-block:: shell

-      .. code-block:: shell
+            export HF_TOKEN=$your_personal_hugging_face_access_token

-         ./pytorch_benchmark_setup.sh
+         Run the setup script to install libraries and datasets needed for benchmarking.

-      ``pytorch_benchmark_setup.sh`` installs the following libraries:
+         .. code-block:: shell

-      .. list-table::
-         :header-rows: 1
+            ./pytorch_benchmark_setup.sh

-         * - Library
-           - Benchmark model
-           - Reference
+         .. container:: model-doc pyt_train_llama-3.1-8b

-         * - ``accelerate``
-           - Llama 3.1 8B, FLUX
-           - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

-         * - ``datasets``
-           - Llama 3.1 8B, 70B, FLUX
-           - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+            .. list-table::
+               :header-rows: 1

-         * - ``torchdata``
-           - Llama 3.1 70B
-           - `TorchData <https://pytorch.org/data/beta/index.html>`_
+               * - Library
+                 - Reference

-         * - ``tomli``
-           - Llama 3.1 70B
-           - `Tomli <https://pypi.org/project/tomli/>`_
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-         * - ``tiktoken``
-           - Llama 3.1 70B
-           - `tiktoken <https://github.com/openai/tiktoken>`_
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-         * - ``blobfile``
-           - Llama 3.1 70B
-           - `blobfile <https://pypi.org/project/blobfile/>`_
+         .. container:: model-doc pyt_train_llama-3.1-70b

-         * - ``tabulate``
-           - Llama 3.1 70B
-           - `tabulate <https://pypi.org/project/tabulate/>`_
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

-         * - ``wandb``
-           - Llama 3.1 70B
-           - `Weights & Biases <https://github.com/wandb/wandb>`_
+            .. list-table::
+               :header-rows: 1

-         * - ``sentencepiece``
-           - Llama 3.1 70B, FLUX
-           - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+               * - Library
+                 - Reference

-         * - ``tensorboard``
-           - Llama 3.1 70 B, FLUX
-           - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-         * - ``csvkit``
-           - FLUX
-           - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+               * - ``torchdata``
+                 - `TorchData <https://pytorch.org/data/beta/index.html>`_

-         * - ``deepspeed``
-           - FLUX
-           - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+               * - ``tomli``
+                 - `Tomli <https://pypi.org/project/tomli/>`_

-         * - ``diffusers``
-           - FLUX
-           - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+               * - ``tiktoken``
+                 - `tiktoken <https://github.com/openai/tiktoken>`_

-         * - ``GitPython``
-           - FLUX
-           - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+               * - ``blobfile``
+                 - `blobfile <https://pypi.org/project/blobfile/>`_

-         * - ``opencv-python-headless``
-           - FLUX
-           - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+               * - ``tabulate``
+                 - `tabulate <https://pypi.org/project/tabulate/>`_

-         * - ``peft``
-           - FLUX
-           - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+               * - ``wandb``
+                 - `Weights & Biases <https://github.com/wandb/wandb>`_

-         * - ``protobuf``
-           - FLUX
-           - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-         * - ``pytest``
-           - FLUX
-           - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-         * - ``python-dotenv``
-           - FLUX
-           - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+         .. container:: model-doc pyt_train_flux

-         * - ``seaborn``
-           - FLUX
-           - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

-         * - ``transformers``
-           - FLUX
-           - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+            .. list-table::
+               :header-rows: 1

-      ``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
+               * - Library
+                 - Reference

-      * `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-      * `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-      Along with the following datasets:
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-      * `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-      * `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
+               * - ``csvkit``
+                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

-      * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+               * - ``deepspeed``
+                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

-      .. rubric:: Pretraining
+               * - ``diffusers``
+                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

-      To start the pretraining benchmark, use the following command with the
-      appropriate options. See the following list of options and their descriptions.
+               * - ``GitPython``
+                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

-      .. code-block:: shell
+               * - ``opencv-python-headless``
+                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

-         ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
+               * - ``peft``
+                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

-      .. list-table::
-         :header-rows: 1
+               * - ``protobuf``
+                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

-         * - Name
-           - Options
-           - Description
+               * - ``pytest``
+                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

-         * - ``$training_mode``
-           - ``pretrain``
-           - Benchmark pretraining
+               * - ``python-dotenv``
+                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

-         * -
-           - ``finetune_fw``
-           - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
+               * - ``seaborn``
+                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

-         * -
-           - ``finetune_lora``
-           - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
+               * - ``transformers``
+                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

-         * -
-           - ``HF_finetune_lora``
-           - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
+         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

-         * - ``$datatype``
-           - ``FP8`` or ``BF16``
-           - Only Llama 3.1 8B supports FP8 precision.
+         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

-         * - ``$model_repo``
-           - ``Llama-3.3-70B``
-           - `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}

-         * - 
-           - ``Llama-3.1-8B``
-           - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
+         .. container:: model-doc {{ model.mad_tag }}

-         * - 
-           - ``Llama-3.1-70B``
-           - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
+            .. rubric:: Pretraining

-         * - 
-           - ``Llama-2-70B``
-           - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.

-         * - 
-           - ``Flux``
-           - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+            .. code-block:: shell

-         * - ``$sequence_length``
-           - Sequence length for the language model.
-           - Between 2048 and 8192. 8192 by default.
+               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length

-      .. note::
+            .. list-table::
+               :header-rows: 1

-         Occasionally, downloading the Flux dataset might fail. In the event of this
-         error, manually download it from Hugging Face at
-         `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
-         and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
-         the required dataset.
+               * - Name
+                 - Options
+                 - Description

-      .. rubric:: Fine-tuning
+            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% endif %}

-      To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
-      with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.

-      .. code-block:: shell
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}

-         ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
+               .. note::

-      Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
-      `Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+         {% endif %}

-      .. code-block:: shell
+         {% if model_group.tag == "fine-tuning" %}
+         .. container:: model-doc {{ model.mad_tag }}

-         ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
+            .. rubric:: Fine-tuning

-      .. rubric:: Benchmarking examples
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.

-      Here are some example commands to get started pretraining and fine-tuning with various model configurations.
+            .. code-block:: shell

-      * Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
+               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

-        .. code-block:: shell
+            .. list-table::
+               :header-rows: 1

-           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
+               * - Name
+                 - Options
+                 - Description

-      * Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.
+               * - ``$training_mode``
+                 - ``finetune_fw``
+                 - Full weight fine-tuning (BF16 supported)

-        .. code-block:: shell
+               * -
+                 - ``finetune_lora``
+                 - LoRA fine-tuning (BF16 supported)

-           ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-70B -s 8192
+               * -
+                 - ``finetune_qlora``
+                 - QLoRA fine-tuning (BF16 supported)

-      * Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
+               * -
+                 - ``HF_finetune_lora``
+                 - LoRA fine-tuning with Hugging Face PEFT

-        .. code-block:: shell
+               * - ``$datatype``
+                 - ``BF16``
+                 - All models support BF16.

-           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.

-      * Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
+            .. note::

-        .. code-block:: shell
+               {{ model.model }} currently supports the following fine-tuning methods:

-           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
+            {% for method in model.training_modes %}
+               * ``{{ method }}``
+            {% endfor %}
+            {% if model.training_modes|length < 4 %}

-      * Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
+               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
+               does not currently provide YAML configuration files for other combinations of
+               model to fine-tuning method.
+               However, you can still configure your own YAML files to enable support for
+               fine-tuning methods not listed here by following existing patterns in the
+               ``/workspace/torchtune/recipes/configs`` directory.
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}

-        .. code-block:: shell
+               .. rubric:: Benchmarking examples

-           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
+               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

-      * Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
+Further reading
+===============

-        .. code-block:: shell
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

-           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

-      * Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
-
-        .. code-block:: shell
-
-           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
-
-      * Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
-
-        .. code-block:: shell
-
-           ./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
-
-      * Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
-
-        .. code-block:: shell
-
-           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================

-This table lists previous versions of the ROCm PyTorch training Docker image for training
-performance validation. For detailed information about available models for
-benchmarking, see the version-specific documentation.
-
-.. list-table::
-   :header-rows: 1
-   :stub-columns: 1
-
-   * - Image version
-     - ROCm version
-     - PyTorch version
-     - Resources
-
-   * - v25.4
-     - 6.3.0
-     - 2.7.0a0+git637433
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_
-
-   * - v25.3
-     - 6.3.0
-     - 2.7.0a0+git637433
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_
+See :doc:`previous-versions/pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,8 +21,12 @@ In this guide, you'll learn about:

 - Training a model

-  - :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`
+  - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`

-  - :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`
+  - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
+
+  - :doc:`With JAX MaxText <benchmark-docker/jax-maxtext>`
+
+  - :doc:`With LLM Foundry <benchmark-docker/mpt-llm-foundry>`

 - :doc:`Scaling model training <scale-model-training>`
--- a/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
+++ b/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
@@ -5,12 +5,13 @@
   :keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax

 .. _train-a-model-system-validation:
+.. _rocm-for-ai-system-optimization:

-**********************************************
-Prerequisite system validation before training
-**********************************************
+**********************************************************
+Prerequisite system validation before running AI workloads
+**********************************************************

-Complete the following system validation and optimization steps to set up your system before starting training.
+Complete the following system validation and optimization steps to set up your system before starting training and inference.

 Disable NUMA auto-balancing
 ---------------------------
@@ -26,7 +27,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

-See :ref:`mi300x-disable-numa` for more information.
+See `Disable NUMA auto-balancing <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_
+in the Instinct documentation for more information.

 Hardware verification with ROCm
 -------------------------------
@@ -42,7 +44,8 @@ Run the command:

   rocm-smi --setperfdeterminism 1900

-See :ref:`mi300x-hardware-verification-with-rocm` for more information.
+See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
+in the Instinct documentation for more information.

 RCCL Bandwidth Test for multi-node setups
 -----------------------------------------
--- a/docs/how-to/rocm-for-hpc/index.rst
+++ b/docs/how-to/rocm-for-hpc/index.rst
@@ -76,14 +76,6 @@ Ubuntu versions.
          single node workstations, multi and many-core nodes, clusters of nodes via
          QMP, and classic vector computers.

-      * -
-        - `Grid <https://github.com/amd/InfinityHub-CI/tree/main/grid/>`_
-        - Grid is a library for lattice QCD calculations that employs a high-level data parallel
-          approach while using a number of techniques to target multiple types of parallelism.
-          The library currently supports MPI, OpenMP and short vector parallelism. The SIMD
-          instructions sets covered include SSE, AVX, AVX2, FMA4, IMCI and AVX512. Recent
-          releases expanded this support to include GPU offloading.
-
      * -
        - `MILC <https://github.com/amd/InfinityHub-CI/tree/main/milc/>`_
        - The MILC Code is a set of research codes developed by MIMD Lattice Computation
@@ -237,12 +229,18 @@ Ubuntu versions.
          of these applications.

      * - Tools and libraries
-        - `ROCm with GPU-aware MPI container <https://github.com/amd/InfinityHub-CI/tree/main/base-gpu-mpi-rocm-docker>`_
+        - `AMD ROCm with OpenMPI container <https://github.com/amd/InfinityHub-CI/blob/main/base-gpu-mpi-rocm-docker>`_
        - Base container for GPU-aware MPI with ROCm for HPC applications. This
          project provides a boilerplate for building and running a Docker
          container with ROCm supporting GPU-aware MPI implementations using
          OpenMPI or UCX.
-
+      
+      * - 
+        - `AMD ROCm with MPICH container <https://github.com/amd/InfinityHub-CI/blob/main/base-mpich-rocm-docker>`_
+        - Base container for GPU-aware MPI with ROCm for HPC applications. This
+          project provides a boilerplate for building and running a Docker
+          container with ROCm supporting GPU-aware MPI implementations using MPICH.
+      
      * -
        - `Kokkos <https://github.com/amd/InfinityHub-CI/tree/main/kokkos>`_
        - Kokkos is a programming model in C++ for writing performance portable
--- a/docs/how-to/setting-cus.rst
+++ b/docs/how-to/setting-cus.rst
@@ -38,5 +38,5 @@ The variable parsing stops when a syntax error occurs. The erroneous set and the

    These environment variables only affect ROCm software, not graphics applications.

-Not all CU configurations are valid on all devices. For example, on devices where two CUs can be combined into a WGP (for kernels running in WGP mode), it’s not valid to disable only a single CU in a WGP. For more information about what to expect when disabling CUs, see the `Exploring AMD GPU Scheduling Details by Experimenting With “Worst Practices” <https://www.cs.unc.edu/~otternes/papers/rtsj2022.pdf>`_ paper.
+Not all CU configurations are valid on all devices. For example, on devices where two CUs can be combined into a WGP (for kernels running in WGP mode), it’s not valid to disable only a single CU in a WGP.

--- a/docs/how-to/tuning-guides/mi300x/index.rst
+++ b/docs/how-to/tuning-guides/mi300x/index.rst
@@ -12,8 +12,7 @@ accelerators. They include detailed instructions on system settings and
 application tuning suggestions to help you fully leverage the capabilities of
 these accelerators, thereby achieving optimal performance.

-* :doc:`../../rocm-for-ai/inference/vllm-benchmark`
-* :doc:`../../rocm-for-ai/inference-optimization/workload`
+* :doc:`/how-to/rocm-for-ai/inference-optimization/workload`
 * `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_


--- a/docs/reference/api-libraries.md
+++ b/docs/reference/api-libraries.md
@@ -45,7 +45,7 @@
 (communication-libraries)=

 * {doc}`RCCL <rccl:index>`
-* [rocSHMEM](https://github.com/ROCm/rocSHMEM)
+* {doc}`rocSHMEM <rocshmem:index>`
 :::

 :::{grid-item-card} Math
--- a/docs/reference/gpu-arch-specs.rst
+++ b/docs/reference/gpu-arch-specs.rst
@@ -281,13 +281,31 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - SGPR File (KiB)
          - GFXIP Major version
          - GFXIP Minor version
+        *
+          - Radeon AI PRO R9700
+          - RDNA4
+          - gfx1201
+          - 32
+          - 64
+          - 32 or 64
+          - 128
+          - 64
+          - 8
+          - N/A
+          - 32
+          - 16
+          - 32
+          - 768
+          - 32
+          - 12
+          - 0
        *
          - Radeon PRO V710
          - RDNA3
          - gfx1101
          - 28
          - 54
-          - 32
+          - 32 or 64
          - 128
          - 56
          - 4
@@ -296,7 +314,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -305,7 +323,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 48
          - 96
-          - 32
+          - 32 or 64
          - 128
          - 96
          - 6
@@ -314,7 +332,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -323,7 +341,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 48
          - 96
-          - 32
+          - 32 or 64
          - 128
          - 96
          - 6
@@ -332,7 +350,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -341,7 +359,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 48
          - 70
-          - 32
+          - 32 or 64
          - 128
          - 96
          - 6
@@ -350,7 +368,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -359,7 +377,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 32
          - 70
-          - 32
+          - 32 or 64
          - 128
          - 64
          - 6
@@ -368,7 +386,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -377,7 +395,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1101
          - 16
          - 48
-          - 32
+          - 32 or 64
          - 128
          - 64
          - 4
@@ -386,7 +404,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -395,7 +413,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 32
          - 60
-          - 32
+          - 32 or 64
          - 128
          - 128
          - 4
@@ -404,7 +422,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -413,7 +431,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 28
-          - 32
+          - 32 or 64
          - 128
          - 32
          - 2
@@ -422,7 +440,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -431,7 +449,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 32
          - 72
-          - 32
+          - 32 or 64
          - 128
          - 128
          - 4
@@ -440,7 +458,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -449,7 +467,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1012
          - 8
          - 22
-          - 32
+          - 32 or 64
          - 128
          -
          - 4
@@ -504,13 +522,85 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - SGPR File (KiB)
          - GFXIP Major version
          - GFXIP Minor version
+        *
+          - Radeon RX 9070 XT
+          - RDNA4
+          - gfx1201
+          - 16
+          - 64
+          - 32 or 64
+          - 128
+          - 64
+          - 8
+          - N/A
+          - 32
+          - 16
+          - 32
+          - 768
+          - 32
+          - 12
+          - 0
+        *
+          - Radeon RX 9070 GRE
+          - RDNA4
+          - gfx1201
+          - 16
+          - 48
+          - 32 or 64
+          - 128
+          - 48
+          - 6
+          - N/A
+          - 32
+          - 16
+          - 32
+          - 768
+          - 32
+          - 12
+          - 0
+        *
+          - Radeon RX 9070
+          - RDNA4
+          - gfx1201
+          - 16
+          - 56
+          - 32 or 64
+          - 128
+          - 64
+          - 8
+          - N/A
+          - 32
+          - 16
+          - 32
+          - 768
+          - 32
+          - 12
+          - 0
+        *
+          - Radeon RX 9060 XT
+          - RDNA4
+          - gfx1200
+          - 16
+          - 32
+          - 32 or 64
+          - 128
+          - 32
+          - 4
+          - N/A
+          - 32
+          - 16
+          - 32
+          - 768
+          - 32
+          - 12
+          - 0
        *
          - Radeon RX 7900 XTX
          - RDNA3
          - gfx1100
          - 24
          - 96
-          - 32
+          - 32 or 64
          - 128
          - 96
          - 6
@@ -519,7 +609,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -528,7 +618,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 20
          - 84
-          - 32
+          - 32 or 64
          - 128
          - 80
          - 6
@@ -537,7 +627,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -546,7 +636,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 16
          - 80
-          - 32
+          - 32 or 64
          - 128
          - 64
          - 6
@@ -555,7 +645,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -564,7 +654,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1101
          - 16
          - 60
-          - 32
+          - 32 or 64
          - 128
          - 64
          - 4
@@ -573,7 +663,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -582,7 +672,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1101
          - 12
          - 54
-          - 32
+          - 32 or 64
          - 128
          - 48
          - 4
@@ -591,7 +681,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -600,7 +690,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1102
          - 8
          - 32
-          - 32
+          - 32 or 64
          - 128
          - 32
          - 2
@@ -609,7 +699,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 11
          - 0
        *
@@ -618,7 +708,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 80
-          - 32
+          - 32 or 64
          - 128
          - 128
          - 4
@@ -627,7 +717,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -636,7 +726,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 80
-          - 32
+          - 32 or 64
          - 128
          - 128
          - 4
@@ -645,7 +735,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -654,7 +744,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 72
-          - 32
+          - 32 or 64
          - 128
          - 128
          - 4
@@ -663,7 +753,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -672,7 +762,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 60
-          - 32
+          - 32 or 64
          - 128
          - 128
          - 4
@@ -681,7 +771,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -690,7 +780,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1031
          - 12
          - 40
-          - 32
+          - 32 or 64
          - 128
          - 96
          - 3
@@ -699,7 +789,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -708,7 +798,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1031
          - 12
          - 40
-          - 32
+          - 32 or 64
          - 128
          - 96
          - 3
@@ -717,7 +807,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -726,7 +816,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1031
          - 10
          - 36
-          - 32
+          - 32 or 64
          - 128
          - 80
          - 3
@@ -735,7 +825,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -744,7 +834,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 32
-          - 32
+          - 32 or 64
          - 128
          - 32
          - 2
@@ -753,7 +843,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -762,7 +852,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 32
-          - 32
+          - 32 or 64
          - 128
          - 32
          - 2
@@ -771,7 +861,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
@@ -780,7 +870,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 28
-          - 32
+          - 32 or 64
          - 128
          - 32
          - 2
@@ -789,7 +879,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 16
+          - 32
          - 10
          - 3
        *
--- a/docs/release/versions.md
+++ b/docs/release/versions.md
@@ -10,6 +10,7 @@

 | Version | Release date |
 | ------- | ------------ |
+| [6.4.1](https://rocm.docs.amd.com/en/docs-6.4.1/) | May 21, 2025 |
 | [6.4.0](https://rocm.docs.amd.com/en/docs-6.4.0/) | April 11, 2025 |
 | [6.3.3](https://rocm.docs.amd.com/en/docs-6.3.3/) | February 19, 2025 |
 | [6.3.2](https://rocm.docs.amd.com/en/docs-6.3.2/) | January 28, 2025 |
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -36,15 +36,19 @@ subtrees:
    title: Use ROCm for AI
    subtrees:
    - entries:
+      - file: how-to/rocm-for-ai/install.rst
+        title: Installation
+      - file: how-to/rocm-for-ai/system-health-check.rst
+        title: System health benchmarks
      - file: how-to/rocm-for-ai/training/index.rst
        title: Training
        subtrees:
        - entries:
-          - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm
+          - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
            title: Train a model with Megatron-LM
-          - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
+          - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
            title: Train a model with PyTorch
-          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext
+          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
            title: Train a model with JAX MaxText
          - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
            title: Train a model with LLM Foundry
@@ -70,15 +74,13 @@ subtrees:
        title: Inference
        subtrees:
        - entries:
-          - file: how-to/rocm-for-ai/inference/install.rst
-            title: Installation
          - file: how-to/rocm-for-ai/inference/hugging-face-models.rst
            title: Run models from Hugging Face
          - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
            title: LLM inference frameworks
-          - file: how-to/rocm-for-ai/inference/vllm-benchmark.rst
+          - file: how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
            title: vLLM inference performance testing
-          - file: how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
+          - file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
            title: PyTorch inference performance testing
          - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
            title: Deploy your model
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,4 +1,4 @@
-rocm-docs-core==1.18.2
+rocm-docs-core==1.23.0
 sphinx-reredirects
 sphinx-sitemap
-sphinxcontrib.datatemplates==0.11.0
+sphinxcontrib.datatemplates==0.11.0
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile docs/sphinx/requirements.in
+#    pip-compile /mnt/nonstandard/ROCm/requirements.in
 #
 accessible-pygments==0.0.5
    # via pydata-sphinx-theme
@@ -10,74 +10,71 @@ alabaster==1.0.0
    # via sphinx
 asttokens==3.0.0
    # via stack-data
-attrs==25.1.0
+attrs==25.3.0
    # via
    #   jsonschema
    #   jupyter-cache
    #   referencing
-babel==2.16.0
+babel==2.17.0
    # via
    #   pydata-sphinx-theme
    #   sphinx
-beautifulsoup4==4.12.3
+beautifulsoup4==4.13.5
    # via pydata-sphinx-theme
-breathe==4.35.0
+breathe==4.36.0
    # via rocm-docs-core
-certifi==2024.8.30
+certifi==2025.8.3
    # via requests
-cffi==1.17.1
+cffi==2.0.0
    # via
    #   cryptography
    #   pynacl
-charset-normalizer==3.4.0
+charset-normalizer==3.4.3
    # via requests
-click==8.1.7
+click==8.2.1
    # via
    #   jupyter-cache
    #   sphinx-external-toc
-comm==0.2.2
+comm==0.2.3
    # via ipykernel
-cryptography==44.0.1
+cryptography==45.0.7
    # via pyjwt
-debugpy==1.8.12
+debugpy==1.8.16
    # via ipykernel
-decorator==5.1.1
+decorator==5.2.1
    # via ipython
 defusedxml==0.7.1
    # via sphinxcontrib-datatemplates
-deprecated==1.2.15
-    # via pygithub
 docutils==0.21.2
    # via
-    #   breathe
    #   myst-parser
    #   pydata-sphinx-theme
    #   sphinx
-exceptiongroup==1.2.2
+exceptiongroup==1.3.0
    # via ipython
-executing==2.2.0
+executing==2.2.1
    # via stack-data
-fastjsonschema==2.20.0
+fastjsonschema==2.21.2
    # via
    #   nbformat
    #   rocm-docs-core
-gitdb==4.0.11
+gitdb==4.0.12
    # via gitpython
-gitpython==3.1.43
+gitpython==3.1.45
    # via rocm-docs-core
-greenlet==3.1.1
+greenlet==3.2.4
    # via sqlalchemy
 idna==3.10
    # via requests
 imagesize==1.4.1
    # via sphinx
-importlib-metadata==8.6.1
+importlib-metadata==8.7.0
    # via
    #   jupyter-cache
    #   myst-nb
-ipykernel==6.29.5
+ipykernel==6.30.1
    # via myst-nb
-ipython==8.31.0
+ipython==8.37.0
    # via
    #   ipykernel
    #   myst-nb
@@ -87,9 +84,9 @@ jinja2==3.1.6
    # via
    #   myst-parser
    #   sphinx
-jsonschema==4.23.0
+jsonschema==4.25.1
    # via nbformat
-jsonschema-specifications==2024.10.1
+jsonschema-specifications==2025.9.1
    # via jsonschema
 jupyter-cache==1.0.1
    # via myst-nb
@@ -97,7 +94,7 @@ jupyter-client==8.6.3
    # via
    #   ipykernel
    #   nbclient
-jupyter-core==5.7.2
+jupyter-core==5.8.1
    # via
    #   ipykernel
    #   jupyter-client
@@ -113,13 +110,13 @@ matplotlib-inline==0.1.7
    # via
    #   ipykernel
    #   ipython
-mdit-py-plugins==0.4.2
+mdit-py-plugins==0.5.0
    # via myst-parser
 mdurl==0.1.2
    # via markdown-it-py
-myst-nb==1.1.2
+myst-nb==1.3.0
    # via rocm-docs-core
-myst-parser==4.0.0
+myst-parser==4.0.1
    # via myst-nb
 nbclient==0.10.2
    # via
@@ -132,41 +129,41 @@ nbformat==5.10.4
    #   nbclient
 nest-asyncio==1.6.0
    # via ipykernel
-packaging==24.2
+packaging==25.0
    # via
    #   ipykernel
    #   sphinx
-parso==0.8.4
+parso==0.8.5
    # via jedi
 pexpect==4.9.0
    # via ipython
-platformdirs==4.3.6
+platformdirs==4.4.0
    # via jupyter-core
-prompt-toolkit==3.0.50
+prompt-toolkit==3.0.52
    # via ipython
-psutil==6.1.1
+psutil==7.0.0
    # via ipykernel
 ptyprocess==0.7.0
    # via pexpect
 pure-eval==0.2.3
    # via stack-data
-pycparser==2.22
+pycparser==2.23
    # via cffi
-pydata-sphinx-theme==0.16.0
+pydata-sphinx-theme==0.16.1
    # via
    #   rocm-docs-core
    #   sphinx-book-theme
-pygithub==2.5.0
+pygithub==2.8.1
    # via rocm-docs-core
-pygments==2.18.0
+pygments==2.19.2
    # via
    #   accessible-pygments
    #   ipython
    #   pydata-sphinx-theme
    #   sphinx
-pyjwt[crypto]==2.10.0
+pyjwt[crypto]==2.10.1
    # via pygithub
-pynacl==1.5.0
+pynacl==1.6.0
    # via pygithub
 python-dateutil==2.9.0.post0
    # via jupyter-client
@@ -178,7 +175,7 @@ pyyaml==6.0.2
    #   rocm-docs-core
    #   sphinx-external-toc
    #   sphinxcontrib-datatemplates
-pyzmq==26.2.0
+pyzmq==27.1.0
    # via
    #   ipykernel
    #   jupyter-client
@@ -186,23 +183,23 @@ referencing==0.36.2
    # via
    #   jsonschema
    #   jsonschema-specifications
-requests==2.32.3
+requests==2.32.5
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.18.2
-    # via -r requirements.in
-rpds-py==0.22.3
+rocm-docs-core==1.23.0
+    # via -r /mnt/nonstandard/ROCm/requirements.in
+rpds-py==0.27.1
    # via
    #   jsonschema
    #   referencing
 six==1.17.0
    # via python-dateutil
-smmap==5.0.1
+smmap==5.0.2
    # via gitdb
-snowballstemmer==2.2.0
+snowballstemmer==3.0.1
    # via sphinx
-soupsieve==2.6
+soupsieve==2.8
    # via beautifulsoup4
 sphinx==8.1.3
    # via
@@ -215,9 +212,9 @@ sphinx==8.1.3
    #   sphinx-copybutton
    #   sphinx-design
    #   sphinx-external-toc
+    #   sphinx-last-updated-by-git
    #   sphinx-notfound-page
    #   sphinx-reredirects
-    #   sphinx-sitemap
    #   sphinxcontrib-datatemplates
    #   sphinxcontrib-runcmd
 sphinx-book-theme==1.1.3
@@ -228,16 +225,18 @@ sphinx-design==0.6.1
    # via rocm-docs-core
 sphinx-external-toc==1.0.1
    # via rocm-docs-core
-sphinx-notfound-page==1.0.4
+sphinx-last-updated-by-git==0.3.8
+    # via sphinx-sitemap
+sphinx-notfound-page==1.1.0
    # via rocm-docs-core
 sphinx-reredirects==0.1.6
-    # via -r requirements.in
-sphinx-sitemap==2.6.0
-    # via -r requirements.in
+    # via -r /mnt/nonstandard/ROCm/requirements.in
+sphinx-sitemap==2.8.0
+    # via -r /mnt/nonstandard/ROCm/requirements.in
 sphinxcontrib-applehelp==2.0.0
    # via sphinx
 sphinxcontrib-datatemplates==0.11.0
-    # via -r requirements.in
+    # via -r /mnt/nonstandard/ROCm/requirements.in
 sphinxcontrib-devhelp==2.0.0
    # via sphinx
 sphinxcontrib-htmlhelp==2.1.0
@@ -250,21 +249,20 @@ sphinxcontrib-runcmd==0.2.0
    # via sphinxcontrib-datatemplates
 sphinxcontrib-serializinghtml==2.0.0
    # via sphinx
-sqlalchemy==2.0.37
+sqlalchemy==2.0.43
    # via jupyter-cache
 stack-data==0.6.3
    # via ipython
 tabulate==0.9.0
    # via jupyter-cache
-tomli==2.1.0
+tomli==2.2.1
    # via sphinx
-tornado==6.4.2
+tornado==6.5.2
    # via
    #   ipykernel
    #   jupyter-client
 traitlets==5.14.3
    # via
-    #   comm
    #   ipykernel
    #   ipython
    #   jupyter-client
@@ -272,21 +270,21 @@ traitlets==5.14.3
    #   matplotlib-inline
    #   nbclient
    #   nbformat
-typing-extensions==4.12.2
+typing-extensions==4.15.0
    # via
+    #   beautifulsoup4
+    #   exceptiongroup
    #   ipython
    #   myst-nb
    #   pydata-sphinx-theme
    #   pygithub
    #   referencing
    #   sqlalchemy
-urllib3==2.2.3
+urllib3==2.5.0
    # via
    #   pygithub
    #   requests
 wcwidth==0.2.13
    # via prompt-toolkit
-wrapt==1.17.0
-    # via deprecated
-zipp==3.21.0
-    # via importlib-metadata
+zipp==3.23.0
+    # via importlib-metadata
--- a/docs/what-is-rocm.rst
+++ b/docs/what-is-rocm.rst
@@ -52,7 +52,7 @@ Communication
  :header: "Component", "Description"

  ":doc:`RCCL <rccl:index>`", "Standalone library that provides multi-GPU and multi-node collective communication primitives"
-  "`rocSHMEM <https://github.com/ROCm/rocSHMEM>`_", "Runtime that provides GPU-centric networking through an OpenSHMEM-like interface. This intra-kernel networking library simplifies application code complexity and enables more fine-grained communication/computation overlap than traditional host-driven networking"
+  ":doc:`rocSHMEM <rocshmem:index>`", "An intra-kernel networking library that provides GPU-centric networking through an OpenSHMEM-like interface"

 Math
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -98,7 +98,7 @@ System Management
 .. csv-table::
  :header: "Component", "Description"

-  ":doc:`AMD SMI <amdsmi:index>`", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
+  ":doc:`AMD SMI <amdsmi:index>`", "System management interface to control AMD GPU settings, monitor performance, and retrieve device and process information"
  ":doc:`ROCm Data Center Tool <rdc:index>`", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
  ":doc:`rocminfo <rocminfo:index>`", "Reports system information"
  ":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
@@ -117,6 +117,11 @@ Performance
  ":doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`", "Toolkit for developing analysis tools for profiling and tracing GPU compute applications. This toolkit is in beta and subject to change"
  ":doc:`ROCTracer <roctracer:index>`", "Intercepts runtime API calls and traces asynchronous activity"

+.. note::
+
+  `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
+  ROCprof Compute Viewer is in an early access state, so running production workloads with it is not recommended.
+
 Development
 ^^^^^^^^^^^

--- a/tools/autotag/components.xml
+++ b/tools/autotag/components.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.0"
+    <default revision="refs/tags/rocm-6.4.1"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/tools/rocm-build/ROCm.mk
+++ b/tools/rocm-build/ROCm.mk
@@ -87,7 +87,6 @@ endef

 $(call adddep,amd_smi_lib,${ASAN_DEP})
 $(call adddep,aqlprofile,${ASAN_DEP} rocr)
-$(call adddep,aqlprofiletest,lightning rocminfo aqlprofile opencl_on_rocclr hip_on_rocclr)
 $(call adddep,comgr,lightning devicelibs)
 $(call adddep,dbgapi,rocr comgr)
 $(call adddep,devicelibs,lightning)
@@ -115,7 +114,7 @@ $(call adddep,roctracer,${ASAN_DEP} rocr hip_on_rocclr)


 # rocm-dev points to all possible last finish components of Stage1 build.
-rocm-dev-components :=amd_smi_lib aqlprofile aqlprofiletest comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
+rocm-dev-components :=amd_smi_lib aqlprofile comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
 	lightning rocprofiler-compute opencl_on_rocclr openmp_extras rocm_bandwidth_test rocm_smi_lib \
 	rocm-cmake rocm-core rocm-gdb rocminfo rocprofiler-register rocprofiler-sdk rocprofiler-systems \
 	rocprofiler rocr rocr_debug_agent rocrsamples roctracer
--- a/tools/rocm-build/build_rocr.sh
+++ b/tools/rocm-build/build_rocr.sh
@@ -255,8 +255,8 @@ print_output_directory() {
 # Common variables
 target="build"

-kfdtest_target="yes"
-rocrtst_target="yes"
+kfdtest_target="no"
+rocrtst_target="no"
 rocr_target="ON"

 package_root="$(getPackageRoot)"
--- a/tools/rocm-build/docker/ubuntu22/install-prerequisities.sh
+++ b/tools/rocm-build/docker/ubuntu22/install-prerequisities.sh
--- a/tools/rocm-build/docker/ubuntu22/packages
+++ b/tools/rocm-build/docker/ubuntu22/packages
@@ -60,7 +60,6 @@ libfile-find-rule-perl
 libgflags-dev
 libglew-dev
 libgmp-dev
-libgoogle-glog-dev
 libgtk2.0-dev
 libhdf5-serial-dev
 libjpeg-dev
@@ -90,7 +89,6 @@ libsuitesparse-dev
 libsystemd-dev
 libtinfo-dev
 libtool
-libunwind-dev
 liburi-encode-perl
 libva-dev
 libvirt-clients
@@ -98,7 +96,6 @@ libvirt-daemon-system
 libyaml-cpp-dev
 libzstd-dev
 llvm
-llvm-6.0-dev
 llvm-dev
 llvm-runtime
 mesa-common-dev
@@ -112,8 +109,7 @@ pigz
 pkg-config
 protobuf-compiler
 python-is-python3
-python-pip-whl
-python-yaml
+python3-pip-whl
 python3-dev
 python3-pip
 python3-venv
--- a/tools/rocm-build/docker/ubuntu24/install-prerequisites.sh
+++ b/tools/rocm-build/docker/ubuntu24/install-prerequisites.sh
@@ -17,7 +17,7 @@ git --version

 # venv for python to be able to run pip3 without --break-system-packages
 python3 -m venv /opt/venv
-
+source /opt/venv/bin/activate
 pip3 install CppHeaderParser argparse lxml recommonmark jinja2==3.0.0 \
    websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas \
    myst-parser setuptools lit
--- a/tools/rocm-build/envsetup.sh
+++ b/tools/rocm-build/envsetup.sh
@@ -217,7 +217,7 @@ export RCCL_ROOT=$WORK_ROOT/rccl
 export ROCM_DBGAPI_ROOT=$WORK_ROOT/ROCdbgapi
 export ROCM_GDB_ROOT=$WORK_ROOT/ROCgdb
 # export ROCclr_ROOT=$WORK_ROOT/vdi
-export HIP_ON_ROCclr_ROOT=$WORK_ROOT/HIP
+export HIP_ON_ROCclr_ROOT=$WORK_ROOT/hip
 export HIPAMD_ROOT=$WORK_ROOT/hipamd
 export HIP_CATCH_TESTS_ROOT=$WORK_ROOT/hip-tests
 # export OPENCL_ON_ROCclr_ROOT=$WORK_ROOT/opencl-on-vdi
--- a/Show More
+++ b/Show More