update group name

add Falcon to vllm-benchmark-models.yaml
2026-01-11 07:38:17 -05:00 · 2025-05-29 10:41:53 -04:00 · 2025-05-08 14:13:03 -04:00
68 changed files with 1844 additions and 2476 deletions
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -77,8 +77,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
-      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+      cmakeBuildDir: 'clr/build'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=amd
@@ -139,8 +138,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
-      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+      cmakeBuildDir: 'clr/build'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=nvidia
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -73,7 +73,6 @@ jobs:
    parameters:
      componentName: upstream-llvm
      cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
-      cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
      installDir: $(Pipeline.Workspace)/llvm
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
--- a/.azuredevops/components/ROCgdb.yml
+++ b/.azuredevops/components/ROCgdb.yml
@@ -15,7 +15,6 @@ parameters:
  type: object
  default:
    - bison
-    - cmake
    - dejagnu
    - flex
    - libbabeltrace-dev
@@ -40,69 +39,17 @@ parameters:
 - name: jobMatrix
  type: object
  default:
-    testJobs:
+    buildTestJobs:
      - gfx942:
        target: gfx942
      - gfx90a:
        target: gfx90a

 jobs:
- job: ROCgdb
-  variables:
-  - group: common
-  - template: /.azuredevops/variables-global.yml
-  - name: PKG_CONFIG_PATH
-    value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
-  pool:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
-  workspace:
-    clean: all
-  steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    parameters:
-      aptPackages: ${{ parameters.aptPackages }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-    parameters:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    parameters:
-      checkoutRef: ${{ parameters.checkoutRef }}
-      dependencyList: ${{ parameters.rocmDependencies }}
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
-    parameters:
-      configureFlags: >-
-        --program-prefix=roc
-        --enable-64-bit-bfd
-        --enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
-        --disable-ld
-        --disable-gas
-        --disable-gdbserver
-        --disable-sim
-        --enable-tui
-        --disable-gdbtk
-        --disable-shared
-        --disable-gprofng
-        --with-expat
-        --with-system-zlib
-        --without-guile
-        --with-babeltrace
-        --with-lzma
-        --with-python=python3
-        --with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
-        LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
-      makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-
- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ROCgdb_test_${{ job.target }}
-    dependsOn: ROCgdb
+- ${{ each job in parameters.jobMatrix.buildTestJobs }}:
+  - job: ROCgdb_build_test_${{ job.target }}
    condition:
-      and(succeeded(),
+      and(
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
        eq(${{ parameters.aggregatePipeline }}, False)
@@ -152,6 +99,8 @@ jobs:
          --with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
          LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
        makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
    - task: Bash@3
      displayName: Setup test environment
      inputs:
@@ -160,6 +109,7 @@ jobs:
          # Assuming that /opt is no longer persistent across runs, test environments are fully ephemeral
          sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
          echo "##vso[task.prependpath]/opt/rocm/bin"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - task: Bash@3
      displayName: check-gdb
--- a/.azuredevops/components/ROCmValidationSuite.yml
+++ b/.azuredevops/components/ROCmValidationSuite.yml
@@ -27,7 +27,6 @@ parameters:
  type: object
  default:
    - amdsmi
-    - aomp
    - clr
    - hipBLAS-common
    - hipBLASLt
@@ -44,7 +43,6 @@ parameters:
  type: object
  default:
    - amdsmi
-    - aomp
    - clr
    - hipBLAS-common
    - hipBLASLt
@@ -110,7 +108,6 @@ jobs:
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/llvm/include
          -DCPACK_PACKAGING_INSTALL_PREFIX=$(Build.BinariesDirectory)
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
--- a/.azuredevops/components/aomp.yml
+++ b/.azuredevops/components/aomp.yml
@@ -118,7 +118,6 @@ jobs:
    parameters:
      componentName: extras
      cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
@@ -130,7 +129,6 @@ jobs:
    parameters:
      componentName: openmp
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
@@ -157,7 +155,6 @@ jobs:
    parameters:
      componentName: offload
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
--- a/.azuredevops/components/copyHIP.yml
+++ b/.azuredevops/components/copyHIP.yml
@@ -26,11 +26,9 @@ jobs:
    parameters:
      componentName: HIP
      pipelineId: $(HIP_PIPELINE_ID)
-  - task: Bash@3
-    displayName: Copy HIP artifacts
-    inputs:
-      targetType: inline
-      script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
+  - template:  ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
+    parameters:
+      sourceDir: $(Agent.BuildDirectory)/rocm
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
--- a/.azuredevops/components/hipSOLVER.yml
+++ b/.azuredevops/components/hipSOLVER.yml
@@ -92,8 +92,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: external
-        cmakeBuildDir: '$(Build.SourcesDirectory)/deps/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/deps'
+        cmakeBuildDir: 'deps/build'
        installDir: '$(Pipeline.Workspace)/deps-install'
        extraBuildFlags: >-
          -DBUILD_BOOST=OFF
--- a/.azuredevops/components/llvm-project.yml
+++ b/.azuredevops/components/llvm-project.yml
@@ -83,8 +83,7 @@ jobs:
        -DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
        -DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
        -GNinja
-      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
+      cmakeBuildDir: 'llvm/build'
      installDir: '$(Build.BinariesDirectory)/llvm'
 # use llvm-lit to run unit tests for llvm, clang, and lld
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
@@ -122,8 +121,7 @@ jobs:
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
        -DCMAKE_BUILD_TYPE=Release
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
+      cmakeBuildDir: 'amd/device-libs/build'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: comgr
@@ -131,8 +129,7 @@ jobs:
        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
        -DCOMGR_DISABLE_SPIRV=1
        -DCMAKE_BUILD_TYPE=Release
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
+      cmakeBuildDir: 'amd/comgr/build'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: comgr
@@ -145,8 +142,7 @@ jobs:
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
        -DHIPCC_BACKWARD_COMPATIBILITY=OFF
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
+      cmakeBuildDir: 'amd/hipcc/build'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -105,7 +105,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
-        cmakeSourceDir: $(Build.SourcesDirectory)/grpc
        installDir: $(Build.SourcesDirectory)/bin
        extraBuildFlags: >-
          -DgRPC_INSTALL=ON
--- a/.azuredevops/components/rocAL.yml
+++ b/.azuredevops/components/rocAL.yml
@@ -125,7 +125,6 @@ jobs:
      parameters:
        componentName: PyBind11
        cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/pybind11'
        customInstallPath: false
        installEnabled: false
        extraBuildFlags: >-
@@ -142,7 +141,6 @@ jobs:
      parameters:
        componentName: RapidJSON
        cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/rapidjson'
        customInstallPath: false
        installEnabled: false
        extraBuildFlags: >-
@@ -202,6 +200,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm/include/rocal
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -108,6 +108,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocJPEG.yml
+++ b/.azuredevops/components/rocJPEG.yml
@@ -114,6 +114,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -5,12 +5,6 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
- name: sparseCheckout
-  type: boolean
-  default: false
- name: sparseCheckoutDir
-  type: string
-  default: ''
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -72,8 +66,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckout: ${{ parameters.sparseCheckout }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -168,6 +168,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -105,7 +105,6 @@ jobs:
          -DLAPACKE=OFF
          -GNinja
        cmakeBuildDir: '$(Build.SourcesDirectory)/lapack/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/lapack'
        installDir: '$(Pipeline.Workspace)/deps-install'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -183,7 +183,6 @@ jobs:
      parameters:
        componentName: rocm-examples
        testDir: $(Build.SourcesDirectory)/build
-        testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "rocfft_callback"'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -167,6 +167,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/dependencies/grpc.yml
+++ b/.azuredevops/dependencies/grpc.yml
@@ -38,7 +38,6 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      cmakeBuildDir: $(Agent.BuildDirectory)/grpc/build
-      cmakeSourceDir: $(Agent.BuildDirectory)/grpc
      extraBuildFlags: >-
        -DgRPC_INSTALL=ON
        -DgRPC_BUILD_TESTS=OFF
--- a/.azuredevops/dependencies/gtest.yml
+++ b/.azuredevops/dependencies/gtest.yml
@@ -38,7 +38,6 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
-      cmakeSourceDir: $(Agent.BuildDirectory)/googletest
      extraBuildFlags: >-
        -DGTEST_FORCE_SHARED_CRT=ON
        -DCMAKE_DEBUG_POSTFIX=d
--- a/.azuredevops/templates/steps/build-cmake.yml
+++ b/.azuredevops/templates/steps/build-cmake.yml
@@ -10,10 +10,10 @@ parameters:
  default: ''
 - name: cmakeBuildDir
  type: string
-  default: $(Agent.BuildDirectory)/s/build
+  default: 'build'
 - name: cmakeSourceDir
  type: string
-  default: $(Agent.BuildDirectory)/s
+  default: '..'
 - name: customBuildTarget
  type: string
  default: ''
@@ -46,7 +46,7 @@ steps:
    ${{ if eq(parameters.customInstallPath, true) }}:
      cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
    ${{ else }}:
-      cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
+      cmakeArgs: ${{ parameters.extraBuildFlags }} ..
 - ${{ if parameters.printDiskSpace }}:
  - script: df -h
    displayName: Disk space before build
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -4,12 +4,6 @@ parameters:
 - name: checkoutRepo
  type: string
  default: 'self'
- name: sparseCheckout
-  type: boolean
-  default: false
- name: sparseCheckoutDir
-  type: string
-  default: ''
 # submodule download behaviour
 # change to 'recursive' for repos with submodules
 - name: submoduleBehaviour
@@ -21,13 +15,3 @@ steps:
    clean: true
    submodules: ${{ parameters.submoduleBehaviour }}
    retryCountOnTaskFailure: 3
-    fetchFilter: blob:none
-    ${{ if eq(parameters.sparseCheckout, true) }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
-      path: sparse
-  - ${{ if eq(parameters.sparseCheckout, true) }}:
-    - task: Bash@3
-      displayName: Symlink sparse checkout
-      inputs:
-        targetType: inline
-        script: ln -s $(Agent.BuildDirectory)/sparse/${{ parameters.sparseCheckoutDir }} $(Agent.BuildDirectory)/s
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -463,7 +463,7 @@ steps:
  displayName: 'List downloaded ROCm files'
  inputs:
    targetType: inline
-    script: ls -la1R $(Agent.BuildDirectory)/rocm
+    script: ls -1R $(Agent.BuildDirectory)/rocm
 - ${{ if eq(parameters.skipLibraryLinking, false) }}:
  - task: Bash@3
    displayName: 'Link ROCm shared libraries'
--- a/.azuredevops/templates/steps/docker-container.yml
+++ b/.azuredevops/templates/steps/docker-container.yml
@@ -106,7 +106,6 @@ parameters:
  type: object
  default:
    - gfx90a
-    - gfx942

 steps:
 # these steps should only be run if there was a failure or warning
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -32,10 +32,8 @@ Andrej
 Arb
 Autocast
 BARs
-BatchNorm
 BLAS
 BMC
-BabelStream
 Blit
 Blockwise
 Bluefield
@@ -126,7 +124,6 @@ FX
 Filesystem
 FindDb
 Flang
-FlashAttention
 FluxBenchmark
 Fortran
 Fuyu
@@ -141,7 +138,6 @@ GDR
 GDS
 GEMM
 GEMMs
-GFLOPS
 GFortran
 GFXIP
 Gemma
@@ -230,8 +226,6 @@ LM
 LSAN
 LSan
 LTS
-LSTMs
-LanguageCrossEntropy
 LoRA
 MEM
 MERCHANTABILITY
@@ -249,7 +243,6 @@ MMIOH
 MMU
 MNIST
 MPI
-MPT
 MSVC
 MVAPICH
 MVFFR
@@ -266,7 +259,6 @@ Meta's
 Miniconda
 MirroredStrategy
 Mixtral
-MosaicML
 Multicore
 Multithreaded
 MyEnvironment
@@ -275,7 +267,6 @@ NBIO
 NBIOs
 NCCL
 NCF
-NFS
 NIC
 NICs
 NLI
@@ -338,7 +329,6 @@ PipelineParallel
 PnP
 PowerEdge
 PowerShell
-Pretrained
 Pretraining
 Profiler's
 PyPi
@@ -386,7 +376,6 @@ Ryzen
 SALU
 SBIOS
 SCA
-ScaledGEMM
 SDK
 SDMA
 SDPA
@@ -427,8 +416,6 @@ TCI
 TCIU
 TCP
 TCR
-TensorRT
-TensorFloat
 TF
 TFLOPS
 TP
@@ -507,7 +494,6 @@ ZenDNN
 accuracies
 activations
 addr
-ade
 ai
 alloc
 allocatable
@@ -515,7 +501,6 @@ allocator
 allocators
 amdgpu
 api
-aten
 atmi
 atomics
 autogenerated
@@ -524,7 +509,6 @@ avx
 awk
 backend
 backends
-bb
 benchmarked
 benchmarking
 bfloat
@@ -548,7 +532,6 @@ cd
 centos
 centric
 changelog
-checkpointing
 chiplet
 cmake
 cmd
@@ -589,7 +572,6 @@ de
 deallocation
 debuggability
 debian
-deepseek
 denoise
 denoised
 denoises
@@ -613,7 +595,6 @@ embeddings
 enablement
 encodings
 endfor
-endif
 endpgm
 enqueue
 env
@@ -656,7 +637,6 @@ hipSPARSELt
 hipTensor
 hipamd
 hipblas
-hipcc
 hipcub
 hipfft
 hipfort
@@ -686,7 +666,6 @@ installable
 interop
 interprocedural
 intra
-intrinsics
 invariants
 invocating
 ipo
@@ -716,7 +695,6 @@ migratable
 miopen
 miopengemm
 mivisionx
-mixtral
 mjx
 mkdir
 mlirmiopen
@@ -833,7 +811,6 @@ roctracer
 rst
 runtime
 runtimes
-ResNet
 sL
 scalability
 scalable
@@ -849,7 +826,6 @@ sm
 smi
 softmax
 spack
-spmm
 src
 stochastically
 strided
@@ -858,10 +834,8 @@ subdirectory
 subexpression
 subfolder
 subfolders
-submatrix
 submodule
 submodules
-subnet
 supercomputing
 symlink
 symlinks
@@ -883,7 +857,6 @@ torchvision
 tqdm
 tracebacks
 txt
-TopK
 uarch
 uncached
 uncacheable
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@ different versions of the ROCm software stack and its components.

 ## ROCm 6.4.1

-See the [ROCm 6.4.1 release notes](https://rocm.docs.amd.com/en/docs-6.4.1/about/release-notes.html)
+See the [ROCm 6.4.1 release notes](https://rocm-stg.amd.com/en/latest/about/release-notes.html)
 for a complete overview of this release.

 ### **AMD SMI** (25.4.2)
@@ -16,24 +16,11 @@ for a complete overview of this release.
 * Dumping CPER entries from RAS tool `amdsmi_get_gpu_cper_entries()` to Python and C APIs.
  - Dumping CPER entries consist of `amdsmi_cper_hdr_t`.
  - Dumping CPER entries is also enabled in the CLI interface through `sudo amd-smi ras --cper`.
-* `amdsmi_get_gpu_busy_percent` to the C API.

-#### Changed
-
-* Modified VRAM display for `amd-smi monitor -v`. 
-
-#### Optimized
-
-* Improved load times for CLI commands when the GPU has multiple parititons.
-
-#### Resolved issues
+#### Resolved

 * Fixed partition enumeration in `amd-smi list -e`, `amdsmi_get_gpu_enumeration_info()`, `amdsmi_enumeration_info_t`, `drm_card`, and `drm_render` fields.

-#### Known issues
-
-* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
-
 ```{note}
 See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
 ```
@@ -42,22 +29,20 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc

 #### Added

-* New log mask enumeration `LOG_COMGR` enables logging precise code object information.
+* New debug mask that prints precise code object information for logging.

 #### Changed

-* HIP runtime uses device bitcode before SPIRV.
-* The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted or disabled by default.
+* Calling the code object has changed. HIP runtime now uses device bitcode before SPIR-V.

 #### Optimized

-* Improved kernel logging includes de-mangling shader names.
-* Refined implementation in HIP APIs `hipEventRecords` and `hipStreamWaitEvent` for performance improvement.
+* Improved kernel logging, which now includes demangled shader names.

 #### Resolved issues

-* Stale state during the graph capture. The return error was fixed, HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
-* Segmentation fault during kernel execution. HIP runtime now allows maximum stack size as per ISA on the GPU device.
+* Stale state during the graph capture. The return error was fixed, and HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
+* Issue of `hipEventRecords` failing to call the `hip::getStream` runtime function.

 ### **hipBLASLt** (0.12.1)

@@ -76,16 +61,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
 * Fixed an issue where early termination, in rare circumstances, could cause the application to stop responding by adding synchronization before destroying a proxy thread.
 * Fixed the accuracy issue for the MSCCLPP `allreduce7` kernel in graph mode.

-#### Known issues
-
-* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
-  This issue will be fixed in a future ROCm release.
-
-* Within the RCCL-UnitTests test suite, failures occur in tests ending with the
-  `.ManagedMem` and `.ManagedMemGraph` suffixes. These failures only affect the
-  test results and do not affect the RCCL component itself. This issue will be
-  resolved in a future ROCm release.
-
 ### **rocALUTION** (3.2.3)

 #### Added
@@ -125,7 +100,7 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele

 #### Added 

-* How-to document for [network performance profiling](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/nic-profiling.html) for standard Network Interface Cards (NICs).
+* How-to document for [network performance profiling](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/amd-staging/how-to/nic-profiling.html) for standard Network Interface Cards (NICs).

 #### Resolved issues

@@ -876,10 +851,6 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele
 #### Added 

 - Support for VA-API and rocDecode tracing.
- Aggregation of MPI data collected across distributed nodes and ranks. The data is concatenated into a single proto file.
-
-#### Changed
- Backend refactored to use [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) rather than [ROCProfiler](https://github.com/ROCm/rocprofiler) and [ROCTracer](https://github.com/ROCm/ROCTracer).

 #### Resolved issues

@@ -890,21 +861,9 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele
 - Fixed interruption in config file generation.

 - Fixed segmentation fault while running rocprof-sys-instrument.
- Fixed an issue where running `rocprof-sys-causal` or using the `-I all` option with `rocprof-sys-sample` caused the system to become non-responsive.
-
- Fixed an issue where sampling multi-GPU Python workloads caused the system to stop responding.
-
-### **ROCm Validation Suite** (1.1.0)
-
-#### Added
-
-* Configuration files for MI210.
-* Support for OCP fp8 data type.
-* GPU index-based CLI execution.

 #### Changed
-
-* JSON logging with updated schema.
+- Backend refactored to use [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) rather than [ROCProfiler](https://github.com/ROCm/rocprofiler) and [ROCTracer](https://github.com/ROCm/ROCTracer).

 ### **rocPRIM** (3.4.0)

--- a/Manifest6.4.0
+++ b/Manifest6.4.0
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<manifest>
+<remote name="gerritgit" fetch="ssh://gerritgit/" review="gerrit-git.amd.com"/>
+<remote name="lightning-ghemu" fetch="ssh://github-emu/AMD-Lightning-Internal"/>
+<remote name="rocm" fetch="https://github.com/ROCm"/>
+<remote name="rocm-ghemu" fetch="ssh://github-emu/AMD-ROCm-Internal"/>
+<default remote="gerritgit" revision="release/rocm-rel-6.4" sync-j="4" sync-c="true"/>
+<project name="AMDMIGraphX" remote="rocm" revision="908b94a3f0822a4fee89d99c3cfc51cd9c93f2f6" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="MIOpen" remote="rocm" revision="f10c6ed8085cfabf8877294ab44301d8180999e8" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="MIVisionX" remote="rocm" revision="a2b69e5b30f2dbdf66055ec99a2b5559b572f7af" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="OpenCL-CLHPP" remote="rocm-ghemu" revision="6f7e82dee83aea7f277a4b874da309902ea51f6e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="OpenCL-Headers" remote="rocm-ghemu" revision="848d67b6fd471318816a81601d469b086487d18e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="ROCR-Runtime" remote="rocm-ghemu" revision="1d9f08cabd33bd6302add72d0be2bfe0e64eea3a" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="ROCdbgapi" remote="rocm-ghemu" revision="59be7ff0aaafe82feb78f30990c8fdf62838cc98" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="ROCgdb" remote="rocm-ghemu" revision="401bb21f2f3c72bbb90ccce12dc3ef481f9a1d8a" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="ROCmValidationSuite" remote="rocm" revision="5f1a9665f6241b0346c88cfd21a6073628da3593" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="Tensile" remote="rocm" revision="be49885fce2a61b600ae4593f1c2d00c8b4fa11e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="TransferBench" remote="rocm" revision="3ea2f226ec818158ba97e4ee0ec0b589f13f4641" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="amdsmi" remote="rocm-ghemu" revision="e6a209ef809f1b09a424572afd685ec754a9042b" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="aomp" path="openmp-extras/aomp" remote="lightning-ghemu" revision="24932c59c0759a57ee52d327d9a10a2e466e35a7" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="llvmdeps,stage1"/>
+<project name="aomp-extras" path="openmp-extras/aomp-extras" remote="lightning-ghemu" revision="6f8038ada9dec082ea091d30c98c0834669d12a1" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="llvmdeps,stage1"/>
+<project name="aqlprofile" remote="rocm-ghemu" revision="7fae75ec6bf7b1a631707ae859542d733f8a1f43" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="build-infra" path="ROCm" remote="rocm-ghemu" revision="811ec9cc6d1588bf66619365b9b4db96ac6acf68" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="infra"/>
+<project name="clr" remote="rocm-ghemu" revision="a1adcfdd44f4560c0268e36c8afeb94f760dc963" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="composable_kernel" remote="rocm" revision="a8c5bd9b9ad950c3e742877e01cb784da91664e3" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="compute/ec/hip-examples" path="HIP-Examples" revision="41b0cff8077a25390c2bbda827eb9f6f37ec1ef3" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="compute/ec/hip-examples-private" path="hip-examples-private" revision="dc69edb405804987753735a369478503d82ce9c2" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="compute/ec/jenkins-utils" path="jenkins-utils" revision="bb517b014ff055b62d3860addc23ddd06b0c3e6e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="infra,stage1"/>
+<project name="compute/ec/ml-framework-ci" path="ml-framework-ci" revision="83440e22ebf1e9443b6df737224c1e5e2b91e0c4" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="infra,framework"/>
+<project name="compute/ec/packaging/meta" path="meta" revision="c7cffa2e4199da1fd68b8b3568282dd59d49a4df" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="infra,stage1"/>
+<project name="compute/ec/prototype" path="build" revision="d71a2766e11e057e5c698caea8fc4ebc0f72cb3e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="infra,stage1"/>
+<project name="compute/ec/rocm_bandwidth_test" path="rocm_bandwidth_test" revision="84b8ddd2686be9bd3e438126b44e6bb10d94d522" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="flang" path="openmp-extras/flang" remote="lightning-ghemu" revision="390169508a03cecf85d43f5cee41e223355f598f" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="llvmdeps,stage1"/>
+<project name="half" remote="rocm" revision="1ddada225144cac0de8f6b5c0dd9acffd99a2e68" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hip" remote="rocm-ghemu" revision="22b0b2eb9a09e30dca11b213872127f9caa2e1e7" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="hip-tests" remote="rocm-ghemu" revision="dc28111737706aad93e38c2f746ccbc13dbf1b80" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="hipBLAS" remote="rocm" revision="0a335435e9c8a833d7106e4ae5057eb58cea2fef" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipBLAS-common" remote="rocm" revision="7c1566ba4628e777b91511242899b6df48555d04" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipBLASLt" remote="rocm" revision="4d62e135cfb4008cf7b508995cad347a1bc750c8" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipCUB" remote="rocm" revision="a6005943c5804535990429925318e7900eb6e801" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipFFT" remote="rocm" revision="396169c84a2bb3c7ed7245caefe66002138e7c6c" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipRAND" remote="rocm" revision="d2516cc199690fd91abfdc5908ecfd88e3553067" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipSOLVER" remote="rocm" revision="ca0de3c9c95df4345b76cd8a56e72c84b7d5fc79" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipSPARSE" remote="rocm" revision="a6c62e48eb8a2326475f7bbb4705c5b926a5edc8" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipSPARSELt" remote="rocm" revision="f3f4f590a49ae9f9c9ce1451c42db4c2bfd00eed" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipTensor" remote="rocm" revision="e5529b92914be79e4887a92b48b30f88b616c9a5" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipfort" remote="rocm" revision="f3d6aa3e8657d665a43fa2815ca2e49ce39a464a" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="hipify" path="HIPIFY" remote="lightning-ghemu" revision="ed0de49132211c6ddbd40f5cd89b5841e832ac3d" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="llvmdeps,stage1"/>
+<project name="hipother" remote="rocm-ghemu" revision="49b1588f834dbe1a4db1bddb3647a91b15f618b8" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="llvm-project" remote="lightning-ghemu" revision="aa0c041cb49bb50af268504907b7899fec59ae4e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="llvmdeps,stage1"/>
+<project name="rccl" remote="rocm" revision="12f8f61f3a5db87bf158c60fdd5e38a32c903b08" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rdc" remote="rocm-ghemu" revision="0224310c872df0fae56ffc883c50c7f47dc82870" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocAL" remote="rocm" revision="373ef865aca43528559e7a9134f09e49a9e9b7c6" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocALUTION" remote="rocm" revision="cb256de3574a4fcbc6a52ed5986b787173cd6dc2" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocBLAS" remote="rocm" revision="80e5394d6a68901ce48b03da47b33b1e69d58be7" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocDecode" remote="rocm" revision="a2a7b63cad8f90a94e21232b44460a8fb2d52304" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocFFT" remote="rocm" revision="058ba87fdcfdae334dbc8dbe048955b248e9328a" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocJPEG" remote="rocm" revision="73d36d35d90137ffbfcec276bdf973823ef0c0b9" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocPRIM" remote="rocm" revision="d8771ec18ad45c4d697800c22fb21241f22a915f" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocPyDecode" remote="rocm" revision="848e49d29d4d6173fb4b57a9223ce68c049baa28" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocRAND" remote="rocm" revision="4d5d3a88d1898705dadf5c06e7b0400d51a13c36" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocSHMEM" remote="rocm" revision="7702b3c0f3f41baf6a80aa6b22fa90dec1a6801e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocSOLVER" remote="rocm" revision="db754e3f55daab54abb86f17cd6b4066c504e163" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocSPARSE" remote="rocm" revision="4953add0aee37ad26700e8bcd6defbfa6b3a4d08" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocThrust" remote="rocm" revision="6bf2777019827e1a2898547ced9a03bf5024ed7d" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocWMMA" remote="rocm" revision="1a5b6231663fcf3e00abf790aeae843278f16a65" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocm-cmake" remote="rocm" revision="ecc716b97c2239cff00422ed7a43cd52a0839a0e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="rocm-core" remote="rocm-ghemu" revision="73dae9c82ace4fb8e1e4028f86ff0365f21c9f51" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="infra,stage1"/>
+<project name="rocm-examples" remote="rocm" revision="3bbd2987a3b46cfd2c8348c2317042f3ad604e38" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocm_smi_lib" remote="rocm-ghemu" revision="1f242d314916336d6ce5c731f486edfaa8f0b987" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocminfo" remote="rocm-ghemu" revision="6ea2ba38c8e1ab2899acf66878148b1192fd0bee" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocprofiler" remote="rocm-ghemu" revision="40da7312a06f8052f5c148a4709cab64686f881d" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocprofiler-compute" remote="rocm" revision="7b25d958b4e030ea64a24ed0a62dcac1e48193ab" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocprofiler-register-internal" path="rocprofiler-register" remote="rocm-ghemu" revision="7c6cd44f637d400b50b803b0b351be302ad6827d" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocprofiler-sdk-internal" path="rocprofiler-sdk" remote="rocm-ghemu" revision="e8e49fe76971000a42a5a177d9a727d16dd0ebcf" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocprofiler-systems" remote="rocm" revision="2e945e4a08781e13a822f568814e2c434fd8858f" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rocr_debug_agent" remote="rocm-ghemu" revision="9eec1a52a36b5203bbac54a1b442fe9a45b6a43e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="roctracer" remote="rocm-ghemu" revision="f55a6943816641c081aa167c8a45904ddae2ba5e" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="stage1"/>
+<project name="rpp" remote="rocm" revision="5fb204ca7018b87889e061b720c5b06f6b9bce9b" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="mathlibs"/>
+<project name="spirv-llvm-translator" path="llvm-project/llvm/projects/SPIRV-LLVM-Translator" remote="lightning-ghemu" revision="ae12ddbec86765df369b18ac764e170082079819" upstream="release/rocm-rel-6.4" dest-branch="release/rocm-rel-6.4" groups="llvmdeps,stage1"/>
+</manifest>
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ The following example shows how to use the repo tool to download the ROCm source
 ```bash
 mkdir -p ~/ROCm/
 cd ~/ROCm/
-export ROCM_VERSION=6.4.1
+export ROCM_VERSION=6.4.0
 ~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
 ~/bin/repo sync
 ```
@@ -77,7 +77,7 @@ The Build time will reduce significantly if we limit the GPU Architecture/s agai

 mkdir -p ~/WORKSPACE/      # Or any folder name other than WORKSPACE
 cd ~/WORKSPACE/
-export ROCM_VERSION=6.4.1
+export ROCM_VERSION=6.4.0
 ~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
 ~/bin/repo sync

@@ -127,7 +127,6 @@ bash install-prerequisites.sh
 export GPU_ARCHS="gfx942"               # Example
 export GPU_ARCHS="gfx940;gfx941;gfx942" # Example

-cd ~/WORKSPACE/
 # Pick and run build commands in the docker container:
 # Build rocm-dev packages
 make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -24,6 +24,8 @@ The release notes provide a summary of notable changes since the previous ROCm r

 - [ROCm known issues](#rocm-known-issues)

+- [ROCm resolved issues](#rocm-resolved-issues)
+
 - [ROCm upcoming changes](#rocm-upcoming-changes)

 ```{note}
@@ -41,7 +43,6 @@ The following are notable new features and improvements in ROCm 6.4.1. For chang
 AMD Instinct MI300X now supports DPX partition mode under NPS2 memory mode. For more partitioning information, see the [Deep dive into the MI300 compute and memory partition modes](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html) blog and [AMD Instinct MI300X system optimization](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#change-gpu-partition-modes).

 ### Introducing the ROCm Data Science toolkit
-
 The ROCm Data Science toolkit (or ROCm-DS) is an open-source software collection for high-performance data science applications built on the core ROCm platform. You can leverage ROCm-DS to accelerate both new and existing data science workloads, allowing you to execute intensive applications with larger datasets at lightning speed. ROCm-DS is in an early access state. Running production workloads is not recommended. For more information, see [AMD ROCm-DS Documentation](https://rocm.docs.amd.com/projects/rocm-ds/en/latest/index.html).

 ### ROCm Offline Installer Creator updates
@@ -55,7 +56,7 @@ The ROCm Runfile Installer 6.4.1 adds the following improvements:
 - Performance improvements for detecting a previous ROCm install. 
 - Removal of the extra `opt` directory created for the target during the ROCm installation.  For example, installing to `target=/home/amd` now installs ROCm to `/home/amd/rocm-6.4.1` and not `/home/amd/opt/rocm-6.4.1`. For installs using `target=/`, the installation will continue to use `/opt/`.
 - The Runfile Installer can be used to uninstall any Runfile-based installation of the driver.
- In the CLI interface, the `postrocm` argument can now be run separately from the `rocm` argument.  In cases where `postrocm` was missed from the initial ROCm install, `postrocm` can now be run on the same target folder. For example, if you installed ROCm 6.4.1 using `install.run target=/myrocm rocm`, you can run the post-installation separately using the command `install.run target=/myrocm/rocm-6.4.1 postrocm`.
+- In the CLI interface, the `postrocm` argument can now be run separately from the `rocm` argument.  In cases where `postrocm` was missed from the initial ROCm install, `postrocm` can now be run on the same target folder. For example, if you installed ROCm 6.4.1 using `install.run target=/myrocm rocm`, you can run the post-installation separately using the command `install.run target=/myrocm/rocm-6.4.1 postrocm`.

 For more information, see [ROCm Runfile Installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/rocm-runfile-installer.html).

@@ -63,24 +64,19 @@ For more information, see [ROCm Runfile Installer](https://rocm.docs.amd.com/pro

 ROCm documentation continues to be updated to provide clearer and more comprehensive guidance for a wider variety of user needs and use cases.

-* [Tutorials for AI developers](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/) have been expanded with five new tutorials. These tutorials are Jupyter notebook-based, easy-to-follow documents. They are ideal for AI developers who want to learn about specific topics, including inference, fine-tuning, and training. For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
+* [Tutorials for AI developers](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/) have been expanded with five
+ new tutorials. These tutorials are Jupyter notebook-based, easy-to-follow documents. They are ideal for AI developers who want to learn about specific topics, including inference, fine-tuning, and training.
 * The [Training a model with LLM Foundry](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.html) performance testing guide has been added. This guide describes how to use the preconfigured [ROCm/pytorch-training](https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5) training environment and [https://github.com/ROCm/MAD](https://github.com/ROCm/MAD) to test the training performance of the LLM Foundry framework on AMD Instinct MI325X and MI300X accelerators using the [MPT-30B](https://huggingface.co/mosaicml/mpt-30b) model.
 * The [Training a model with PyTorch](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html) performance testing guide has been updated to feature the latest [ROCm/pytorch-training](https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5) Docker image (a preconfigured training environment with ROCm and PyTorch). Support for [Llama 3.3 70B](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) has been added.
 * The [Training a model with JAX MaxText](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.html) performance testing guide has been updated to feature the latest [ROCm/jax-training](https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b) Docker image (a preconfigured training environment with ROCm, JAX, and [MaxText](https://github.com/AI-Hypercomputer/maxtext)). Support for [Llama 3.3 70B](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) has been added.
-* The [vLLM inference performance testing](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/vllm-benchmark.html?model=pyt_vllm_qwq-32b) guide has been updated to feature the latest [ROCm/vLLM](https://hub.docker.com/layers/rocm/vllm/latest/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4) Docker image (a preconfigured environment for inference with ROCm and [vLLM](https://docs.vllm.ai/en/latest/)). Support for the [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) model has been added.
+* The [vLLM inference performance testing](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/vllm-benchmark.html?model=pyt_vllm_qwq-32b) guide has been updated to feature the latest [ROCm/vLLM](https://hub.docker.com/layers/rocm/vllm/instinct_main/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845) Docker image (a preconfigured environment for inference with ROCm and [vLLM](https://docs.vllm.ai/en/latest/)). Support for the [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) model has been added.
 * The [PyTorch inference performance testing](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.html?model=pyt_clip_inference) guide has been added, featuring the [ROCm/PyTorch](https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-ab1d350b818b90123cfda31363019d11c0d41a8f12a19e3cb2cb40cf0261137d) Docker image (a preconfigured inference environment with ROCm and PyTorch) with initial support for the [CLIP](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) and [Chai-1](https://huggingface.co/chaidiscovery/chai-1) models.
+* The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic has been updated with new information in the library's precision support list.
+* The deep learning frameworks compatibility pages have been updated with new information and are reorganized, making them easier to review. For more information, see [PyTorch compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html), [TensorFlow compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html), and [JAX compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html).

 ## Operating system and hardware support changes

-ROCm 6.4.1 introduces support for the RDNA4 architecture-based [Radeon AI PRO
-R9700](https://www.amd.com/en/products/graphics/workstations/radeon-ai-pro/ai-9000-series/amd-radeon-ai-pro-r9700.html),
-[Radeon RX 9070](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070.html),
-[Radeon RX 9070 XT](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070xt.html),
-Radeon RX 9070 GRE, and
-[Radeon RX 9060 XT](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9060xt.html) GPUs
-for compute workloads. It also adds support for RDNA3 architecture-based [Radeon PRO W7700](https://www.amd.com/en/products/graphics/workstations/radeon-pro/w7700.html) and [Radeon RX 7800 XT](https://www.amd.com/en/products/graphics/desktops/radeon/7000-series/amd-radeon-rx-7800-xt.html) GPUs. These GPUs are supported on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
-For details, see the full list of [Supported GPUs
-(Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus).
+Operating system and hardware support remain unchanged in this release.

 See the [Compatibility
 matrix](../../docs/compatibility/compatibility-matrix.rst)
@@ -111,47 +107,47 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="9">Libraries</th>
                <th rowspan="9">Machine learning and computer vision</th>
-                <td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.4.1/index.html">Composable Kernel</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.4.0/index.html">Composable Kernel</a></td>
                <td>1.1.0</td>
                <td><a href="https://github.com/ROCm/composable_kernel"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.4.1/index.html">MIGraphX</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.4.0/index.html">MIGraphX</a></td>
                <td>2.12.0</td>
                <td><a href="https://github.com/ROCm/AMDMIGraphX"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.4.1/index.html">MIOpen</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.4.0/index.html">MIOpen</a></td>
                <td>3.4.0</td>
                <td><a href="https://github.com/ROCm/MIOpen"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.4.1/index.html">MIVisionX</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.4.0/index.html">MIVisionX</a></td>
                <td>3.2.0</td>
                <td><a href="https://github.com/ROCm/MIVisionX"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocAL/en/docs-6.4.1/index.html">rocAL</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocAL/en/docs-6.4.0/index.html">rocAL</a></td>
                <td>2.2.0</td>
                <td><a href="https://github.com/ROCm/rocAL"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.4.1/index.html">rocDecode</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.4.0/index.html">rocDecode</a></td>
                <td>0.10.0</td>
                <td><a href="https://github.com/ROCm/rocDecode"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocJPEG/en/docs-6.4.1/index.html">rocJPEG</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocJPEG/en/docs-6.4.0/index.html">rocJPEG</a></td>
                <td>0.8.0</td>
                <td><a href="https://github.com/ROCm/rocJPEG"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocPyDecode/en/docs-6.4.1/index.html">rocPyDecode</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocPyDecode/en/docs-6.4.0/index.html">rocPyDecode</a></td>
                <td>0.3.1</td>
                <td><a href="https://github.com/ROCm/rocPyDecode"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.4.1/index.html">RPP</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.4.0/index.html">RPP</a></td>
                <td>1.9.10</td>
                <td><a href="https://github.com/ROCm/rpp"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -160,12 +156,12 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="2"></th>
                <th rowspan="2">Communication</th>
-                <td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.4.1/index.html">RCCL</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.4.0/index.html">RCCL</a></td>
                <td>2.22.3&nbsp;&Rightarrow;&nbsp;<a href="#rccl-2-22-3">2.22.3</td>
                <td><a href="https://github.com/ROCm/rccl"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-            <td><a href="https://rocm.docs.amd.com/projects/rocSHMEM/en/docs-6.4.1/index.html">rocSHMEM</a></td>
+            <td><a href="https://github.com/ROCm/rocSHMEM">rocSHMEM</a></td>
                <td>2.0.0</td>
                <td><a href="https://github.com/ROCm/rocSHMEM"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -174,82 +170,82 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="16"></th>
                <th rowspan="16">Math</th>
-                <td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.4.1/index.html">hipBLAS</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.4.0/index.html">hipBLAS</a></td>
                <td>2.4.0</td>
                <td><a href="https://github.com/ROCm/hipBLAS"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.4.1/index.html">hipBLASLt</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.4.0/index.html">hipBLASLt</a></td>
                 <td>0.12.0&nbsp;&Rightarrow;&nbsp;<a href="#hipblaslt-0-12-1">0.12.1</a></td>
                <td><a href="https://github.com/ROCm/hipBLASLt"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.4.1/index.html">hipFFT</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.4.0/index.html">hipFFT</a></td>
                <td>1.0.18</td>
                <td><a href="https://github.com/ROCm/hipFFT"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.4.1/index.html">hipfort</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.4.0/index.html">hipfort</a></td>
                <td>0.6.0</td>
                <td><a href="https://github.com/ROCm/hipfort"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.4.1/index.html">hipRAND</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.4.0/index.html">hipRAND</a></td>
                <td>2.12.0</td>
                <td><a href="https://github.com/ROCm/hipRAND"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.4.1/index.html">hipSOLVER</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.4.0/index.html">hipSOLVER</a></td>
                <td>2.4.0</td>
                <td><a href="https://github.com/ROCm/hipSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.4.1/index.html">hipSPARSE</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.4.0/index.html">hipSPARSE</a></td>
                <td>3.2.0</td>
                <td><a href="https://github.com/ROCm/hipSPARSE"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.4.1/index.html">hipSPARSELt</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.4.0/index.html">hipSPARSELt</a></td>
                <td>0.2.3</td>
                <td><a href="https://github.com/ROCm/hipSPARSELt"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.4.1/index.html">rocALUTION</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.4.0/index.html">rocALUTION</a></td>
                 <td>3.2.2&nbsp;&Rightarrow;&nbsp;<a href="#rocalution-3-2-3">3.2.3</a></td>
                <td><a href="https://github.com/ROCm/rocALUTION"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.4.1/index.html">rocBLAS</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.4.0/index.html">rocBLAS</a></td>
                <td>4.4.0</td>
                <td><a href="https://github.com/ROCm/rocBLAS"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.4.1/index.html">rocFFT</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.4.0/index.html">rocFFT</a></td>
                <td>1.0.32</td>
                <td><a href="https://github.com/ROCm/rocFFT"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.4.1/index.html">rocRAND</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.4.0/index.html">rocRAND</a></td>
                <td>3.3.0</td>
                <td><a href="https://github.com/ROCm/rocRAND"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.4.1/index.html">rocSOLVER</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.4.0/index.html">rocSOLVER</a></td>
                <td>3.28.0</td>
                <td><a href="https://github.com/ROCm/rocSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.4.1/index.html">rocSPARSE</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.4.0/index.html">rocSPARSE</a></td>
                <td>3.4.0</td>
                <td><a href="https://github.com/ROCm/rocSPARSE"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.4.1/index.html">rocWMMA</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.4.0/index.html">rocWMMA</a></td>
                <td>1.7.0</td>
                <td><a href="https://github.com/ROCm/rocWMMA"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/Tensile/en/docs-6.4.1/src/index.html">Tensile</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/Tensile/en/docs-6.4.0/src/index.html">Tensile</a></td>
                <td>4.43.0</td>
                <td><a href="https://github.com/ROCm/Tensile"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -258,22 +254,22 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="4"></th>
                <th rowspan="4">Primitives</th>
-                <td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.4.1/index.html">hipCUB</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.4.0/index.html">hipCUB</a></td>
                <td>3.4.0</td>
                <td><a href="https://github.com/ROCm/hipCUB"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.4.1/index.html">hipTensor</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.4.0/index.html">hipTensor</a></td>
                <td>1.5.0</td>
                <td><a href="https://github.com/ROCm/hipTensor"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.4.1/index.html">rocPRIM</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.4.0/index.html">rocPRIM</a></td>
                <td>3.4.0</td>
                <td><a href="https://github.com/ROCm/rocPRIM"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.4.1/index.html">rocThrust</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.4.0/index.html">rocThrust</a></td>
                <td>3.3.0</td>
                <td><a href="https://github.com/ROCm/rocThrust"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -282,27 +278,27 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="7">Tools</th>
                <th rowspan="7">System management</th>
-                <td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.4.1/index.html">AMD SMI</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.4.0/index.html">AMD SMI</a></td>
                <td>25.3.0&nbsp;&Rightarrow;&nbsp;<a href="#amd-smi-25-4-2">25.4.2</a></td>
                <td><a href="https://github.com/ROCm/amdsmi"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.4.1/index.html">ROCm Data Center Tool</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.4.0/index.html">ROCm Data Center Tool</a></td>
                 <td>0.3.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-data-center-tool-0-3-0">0.3.0</a></td>
                <td><a href="https://github.com/ROCm/rdc"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.4.1/index.html">rocminfo</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.4.0/index.html">rocminfo</a></td>
                <td>1.0.0</td>
                <td><a href="https://github.com/ROCm/rocminfo"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.4.1/index.html">ROCm SMI</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.4.0/index.html">ROCm SMI</a></td>
                <td>7.5.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-smi-7-5-0">7.5.0</a></td>
                <td><a href="https://github.com/ROCm/rocm_smi_lib"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.4.1/index.html">ROCmValidationSuite</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.4.0/index.html">ROCmValidationSuite</a></td>
                <td>1.1.0</td>
                <td><a href="https://github.com/ROCm/ROCmValidationSuite"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -311,38 +307,38 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="6"></th>
                <th rowspan="6">Performance</th>
-                <td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.4.1/index.html">ROCm Bandwidth
+                <td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.4.0/index.html">ROCm Bandwidth
                        Test</a></td>
                <td>1.4.0</td>
                <td><a href="https://github.com/ROCm/rocm_bandwidth_test/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.1/index.html">ROCm Compute Profiler</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.0/index.html">ROCm Compute Profiler</a></td>
                <td>3.1.0</td>
                <td><a href="https://github.com/ROCm/rocprofiler-compute"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-systems/en/docs-6.4.1/index.html">ROCm Systems Profiler</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-systems/en/docs-6.4.0/index.html">ROCm Systems Profiler</a></td>
                 <td>1.0.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-systems-profiler-1-0-1">1.0.1</a></td>
                <td><a href="https://github.com/ROCm/rocprofiler-systems"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.1/index.html">ROCProfiler</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.0/index.html">ROCProfiler</a></td>
                <td>2.0.0</td>
                <td><a href="https://github.com/ROCm/ROCProfiler/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-6.4.1/index.html">ROCprofiler-SDK</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-6.4.0/index.html">ROCprofiler-SDK</a></td>
                <td>0.6.0</td>
                <td><a href="https://github.com/ROCm/rocprofiler-sdk/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr >
-                <td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-6.4.1/index.html">ROCTracer</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-6.4.0/index.html">ROCTracer</a></td>
                <td>4.1.0</td>
                <td><a href="https://github.com/ROCm/ROCTracer/"><i
                            class="fab fa-github fa-lg"></i></a></td>
@@ -352,32 +348,32 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="5"></th>
                <th rowspan="5">Development</th>
-                <td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.4.1/index.html">HIPIFY</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.4.0/index.html">HIPIFY</a></td>
                <td>19.0.0</td>
                <td><a href="https://github.com/ROCm/HIPIFY/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.4.1/index.html">ROCdbgapi</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.4.0/index.html">ROCdbgapi</a></td>
                <td>0.77.2</td>
                <td><a href="https://github.com/ROCm/ROCdbgapi/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.4.1/index.html">ROCm CMake</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.4.0/index.html">ROCm CMake</a></td>
                <td>0.14.0</td>
                <td><a href="https://github.com/ROCm/rocm-cmake/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.4.1/index.html">ROCm Debugger (ROCgdb)</a>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.4.0/index.html">ROCm Debugger (ROCgdb)</a>
                </td>
                <td>15.2</td>
                <td><a href="https://github.com/ROCm/ROCgdb/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.4.1/index.html">ROCr Debug Agent</a>
+                <td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.4.0/index.html">ROCr Debug Agent</a>
                </td>
                <td>2.0.4</td>
                <td><a href="https://github.com/ROCm/rocr_debug_agent/"><i
@@ -387,13 +383,13 @@ Click {fab}`github` to go to the component's source code on GitHub.
        <tbody class="rocm-components-compilers tbody-reverse-zebra">
            <tr>
                <th rowspan="2" colspan="2">Compilers</th>
-                <td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.4.1/index.html">HIPCC</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.4.0/index.html">HIPCC</a></td>
                <td>1.1.1</td>
-                <td><a href="https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc"><i
+                <td><a href="https://github.com/ROCm/llvm-project/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.1/index.html">llvm-project</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.0/index.html">llvm-project</a></td>
                <td>19.0.0</td>
                <td><a href="https://github.com/ROCm/llvm-project/"><i
                            class="fab fa-github fa-lg"></i></a></td>
@@ -402,12 +398,12 @@ Click {fab}`github` to go to the component's source code on GitHub.
        <tbody class="rocm-components-runtimes tbody-reverse-zebra">
            <tr>
                <th rowspan="2" colspan="2">Runtimes</th>
-                <td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.1/index.html">HIP</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.0/index.html">HIP</a></td>
                 <td>6.4.0&nbsp;&Rightarrow;&nbsp;<a href="#hip-6-4-1">6.4.1</a></td>
                <td><a href="https://github.com/ROCm/HIP/"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.4.1/index.html">ROCr Runtime</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.4.0/index.html">ROCr Runtime</a></td>
                 <td>1.15.0&nbsp;&Rightarrow;&nbsp;<a href="#rocr-runtime-1-15-0">1.15.0</a></td>
                <td><a href="https://github.com/ROCm/ROCR-Runtime/"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -430,24 +426,11 @@ For a historical overview of ROCm component updates, see the {doc}`ROCm consolid
 * Dumping CPER entries from RAS tool `amdsmi_get_gpu_cper_entries()` to Python and C APIs.
  - Dumping CPER entries consist of `amdsmi_cper_hdr_t`.
  - Dumping CPER entries is also enabled in the CLI interface through `sudo amd-smi ras --cper`.
-* `amdsmi_get_gpu_busy_percent` to the C API.

-#### Changed
-
-* Modified VRAM display for amd-smi monitor -v. 
-
-#### Optimized
-
-* Improved load times for CLI commands when the GPU has multiple parititons.
-
-#### Resolved issues
+#### Resolved

 * Fixed partition enumeration in `amd-smi list -e`, `amdsmi_get_gpu_enumeration_info()`, `amdsmi_enumeration_info_t`, `drm_card`, and `drm_render` fields.

-#### Known issues
-
-* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
-
 ```{note}
 See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
 ```
@@ -456,22 +439,20 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc

 #### Added

-* New log mask enumeration `LOG_COMGR` enables logging precise code object information.
+* New debug mask, to print precise code object information for logging.

 #### Changed

-* HIP runtime uses device bitcode before SPIRV.
-* The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted/disabled by default.
+* Calling the code object has changed. HIP runtime now uses device bitcode before SPIR-V.

 #### Optimized

-* Improved kernel logging includes de-mangling shader names.
-* Refined implementation in HIP APIs `hipEventRecords` and `hipStreamWaitEvent` for performance improvement.
+* Improved kernel logging by demangling shader names.

 #### Resolved issues

-* Stale state during the graph capture. The return error was fixed, HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
-* Segmentation fault during kernel execution. HIP runtime now allows maximum stack size as per ISA on the GPU device.
+* Stale state during the graph capture. The return error was fixed, and HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
+* Issue of `hipEventRecords` failing to call the `hip::getStream` runtime function.

 ### **hipBLASLt** (0.12.1)

@@ -490,16 +471,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
 * Fixed an issue where early termination, in rare circumstances, could cause the application to stop responding by adding synchronization before destroying a proxy thread.
 * Fixed the accuracy issue for the MSCCLPP `allreduce7` kernel in graph mode.

-#### Known issues
-
-* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
-  This issue will be fixed in a future ROCm release.
-
-* Within the RCCL-UnitTests test suite, failures occur in tests ending with the
-  `.ManagedMem` and `.ManagedMemGraph` suffixes. These failures only affect the
-  test results and do not affect the RCCL component itself. This issue will be
-  resolved in a future ROCm release.
-
 ### **rocALUTION** (3.2.3)

 #### Added
@@ -539,7 +510,7 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele

 #### Added 

-* How-to document for [network performance profiling](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/nic-profiling.html) for standard Network Interface Cards (NICs).
+* How-to document for [network performance profiling](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/amd-staging/how-to/nic-profiling.html) for standard Network Interface Cards (NICs).

 #### Resolved issues

@@ -556,35 +527,6 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele
 ROCm known issues are noted on {fab}`github` [GitHub](https://github.com/ROCm/ROCm/labels/Verified%20Issue). For known
 issues related to individual components, review the [Detailed component changes](#detailed-component-changes).

-### Radeon AI PRO R9700 hangs when running Stable Diffusion 2.1 at batch sizes above four
-
-Radeon AI PRO R9700 GPUs might hang when running [Stable Diffusion
-2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) with batch sizes
-greater than four. As a workaround, limit batch sizes to four or fewer. This issue
-will be addressed in a future ROCm release. See [issue #4770](https://github.com/ROCm/ROCm/issues/4770) on GitHub.
-
-### RCCL MSCCL initialization failure
-
-When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
-This issue will be fixed in a future ROCm release. See [issue #4769](https://github.com/ROCm/ROCm/issues/4769) on GitHub.
-
-### AMD SMI CLI: CPER entries not dumped continuously when using follow flag
-
-* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
-See [issue #4768](https://github.com/ROCm/ROCm/issues/4768) on GitHub.
-
-### ROCm SMI uninstallation issue on RHEL and SLES
-
-`rocm-smi-lib` does not get uninstalled and remains orphaned on RHEL and SLES systems when:
-
-* [Uninstalling ROCm using the AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html#uninstalling-rocm) with `amdgpu-install --uninstall`
-
-* [Uninstalling via package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/install-methods/package-manager/package-manager-rhel.html#uninstall-rocm-packages)
-  with `dnf remove rocm-core` on RHEL or `zypper remove rocm-core` on SLES.
-
-As a workaround, manually remove the `rocm-smi-lib` package using `sudo dnf remove rocm-smi-lib` or `sudo zypper remove rocm-smi-lib`.
-See [issue #4767](https://github.com/ROCm/ROCm/issues/4767) on GitHub.
-
 ## ROCm upcoming changes

 The following changes to the ROCm software stack are anticipated for future releases.
@@ -654,4 +596,4 @@ There are a number of upcoming changes planned for HIP runtime API in an upcomin
 that are not backward compatible with prior releases. Most of these changes increase 
 alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to 
 clean up header files, remove namespace collision, and have a clear separation between 
-`hipRTC` and HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0:-guidance-on-upcoming-compatibility-changes/README.html).
+`hipRTC` and HIP runtime. For more information, refer to [HIP Upcoming changes](https://rocm.docs.amd.com/en/latest/about/release-notes.html#id15).
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.1"
+    <default revision="refs/tags/rocm-6.4.0"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -2,25 +2,21 @@ ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5,
      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
      ,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
      ,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
      ,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
      ,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
-      ,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
-      ,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
+,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA4,,,,,,,,,,,,,,,
      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
@@ -42,7 +38,7 @@ ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5,
      CUB,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
 ,,,,,,,,,,,,,,,,
      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      KMD versions,"6.4.x, 6.3.x","6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
      ,,,,,,,,,,,,,,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
@@ -57,7 +53,7 @@ ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5,
      ,,,,,,,,,,,,,,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-      :doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      `rocSHMEM <https://github.com/ROCm/rocSHMEM>`_ ,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      ,,,,,,,,,,,,,,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
@@ -115,9 +111,9 @@ ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5,
      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25172,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25172,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25172,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
 ,,,,,,,,,,,,,,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -28,7 +28,7 @@ compatibility and system requirements.

      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
-      ,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4"
+      ,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4"
      ,RHEL 8.10,RHEL 8.10,RHEL 8.10
      ,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5"
      ,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
@@ -38,14 +38,10 @@ compatibility and system requirements.
      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
-      ,RDNA4,,
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,,
-      ,gfx1200 [#RDNA-OS]_,,
-      ,gfx1101 [#RDNA-OS]_,,
-      ,gfx1100,gfx1100,gfx1100
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100
      ,gfx1030,gfx1030,gfx1030
      ,gfx942,gfx942,gfx942
      ,gfx90a,gfx90a,gfx90a
@@ -66,7 +62,7 @@ compatibility and system requirements.
      CUB,2.5.0,2.5.0,2.3.2
      ,,,
      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
+      KMD versions,"6.4.x, 6.3.x","6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
@@ -81,7 +77,7 @@ compatibility and system requirements.
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
-      :doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A
+      `rocSHMEM <https://github.com/ROCm/rocSHMEM>`_ ,2.0.0,2.0.0,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
@@ -139,9 +135,9 @@ compatibility and system requirements.
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.24455
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.24491
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.24491
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25172,19.0.0.25133,18.0.0.24455
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25172,19.0.0.25133,18.0.0.24491
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25172,19.0.0.25133,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42131
@@ -157,7 +153,6 @@ compatibility and system requirements.
 .. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
 .. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
-.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.

 .. _OS-kernel-versions:

@@ -175,8 +170,7 @@ Use this lookup table to confirm which operating system and kernel versions are
   ,,
   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
-   , 9.5, 5.14+, 2.34
+   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.5, 5.14+, 2.34
   ,9.4, 5.14+, 2.34
   ,9.3, 5.14+, 2.34
   ,,
@@ -237,4 +231,3 @@ Expand for full historical view of:
   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
-   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -14,18 +14,17 @@ JAX provides a NumPy-like API, which combines automatic differentiation and the
 Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
 learning at scale.

-JAX uses composable transformations of Python and NumPy through just-in-time
-(JIT) compilation, automatic vectorization, and parallelization. To learn about
-JAX, including profiling and optimizations, see the official `JAX documentation
+JAX uses composable transformations of Python and NumPy through just-in-time (JIT) compilation,
+automatic vectorization, and parallelization. To learn about JAX, including profiling and
+optimizations, see the official `JAX documentation
 <https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.

-ROCm support for JAX is upstreamed, and users can build the official source code
-with ROCm support:
+ROCm support for JAX is upstreamed, and users can build the official source code with ROCm
+support:

 - ROCm JAX release:

-  - Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
-    with ROCm and JAX preinstalled.
+  - Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>` with ROCm and JAX pre-installed.

  - ROCm JAX repository: `ROCm/jax <https://github.com/ROCm/jax>`_

@@ -37,8 +36,8 @@ with ROCm support:
  - Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_

  - See the `AMD GPU (Linux) installation section
-    <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
-    the JAX documentation.
+    <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in the JAX
+    documentation.

 .. note::

@@ -47,44 +46,6 @@ with ROCm support:
   `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
   follow upstream JAX releases and use the latest available ROCm version.

-Use cases and recommendations
-================================================================================
-
-* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
-  blog explores the implementation and training of a Generative Pre-trained
-  Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s JAX-based
-  nanoGPT. Comparing how essential GPT components—such as self-attention
-  mechanisms and optimizers—are realized in JAX and JAX, also highlights
-  JAX’s unique features.
-
-* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
-  ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
-  blog post provides a comprehensive guide on enhancing the training efficiency
-  of GPT models by implementing mixed precision techniques in JAX, specifically
-  tailored for AMD GPUs utilizing the ROCm platform.
-
-* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
-  blog demonstrates how to develop a custom fused dropout-activation kernel for
-  matrices using Triton, integrate it with JAX, and benchmark its performance
-  using ROCm.
-
-* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
-  outlines the process of fine-tuning a Bidirectional Encoder Representations
-  from Transformers (BERT)-based large language model (LLM) using JAX for a text
-  classification task. The blog post discuss techniques for parallelizing the
-  fine-tuning across multiple AMD GPUs and assess the model's performance on a
-  holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
-  and the General Language Understanding Evaluation (GLUE) benchmark dataset was
-  used on a multi-GPU setup.
-
-* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
-  provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
-  accelerator using ROCm. The page is aimed at helping users achieve optimal
-  performance for deep learning and other high-performance computing tasks on
-  the MI300X GPU.
-
-For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
-
 .. _jax-docker-compat:

 Docker image compatibility
@@ -96,8 +57,8 @@ Docker image compatibility

 AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
 with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories represent the latest JAX version from the official Docker Hub and are validated for
-`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
+associated inventories are validated for
+`ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click the |docker-icon|
 icon to view the image on Docker Hub.

 .. list-table:: JAX Docker image components
@@ -110,19 +71,19 @@ icon to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.12/images/sha256-7a0745a2a2758bdf86397750bac00e9086cbf67d170cfdbb08af73f7c7d18a6a"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4-jax0.4.35-py3.12/images/sha256-4069398229078f3311128b6d276c6af377c7e97d3363d020b0bf7154fae619ca"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>

      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
      - Ubuntu 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `3.12.7 <https://www.python.org/downloads/release/python-3127/>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.10/images/sha256-5f9e8d6e6e69fdc9a1a3f2ba3b1234c3f46c53b7468538c07fd18b00899da54f"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4-jax0.4.35-py3.10/images/sha256-a137f901f91ce6c13b424c40a6cf535248d4d20fd36d5daf5eee0570190a4a11"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>

      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
      - Ubuntu 22.04
-      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `3.10.14 <https://www.python.org/downloads/release/python-31014/>`_

 AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
 with ROCm backends on Docker Hub. The following Docker image tags and
@@ -160,14 +121,13 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
      - Ubuntu 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_

-.. _key_rocm_libraries:
-
-Key ROCm libraries for JAX
+Critical ROCm libraries for JAX
 ================================================================================

-The following ROCm libraries represent potential targets that could be utilized
-by JAX on ROCm for various computational tasks. The actual libraries used will
-depend on the specific implementation and operations performed.
+The functionality of JAX with ROCm is determined by its underlying library
+dependencies. These critical ROCm components affect the capabilities,
+performance, and feature set available to developers. The versions described
+are available in ROCm :version:`rocm_version`.

 .. list-table::
    :header-rows: 1
@@ -175,140 +135,539 @@ depend on the specific implementation and operations performed.
    * - ROCm library
      - Version
      - Purpose
+      - Used in
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
      - :version-ref:`hipBLAS rocm_version`
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
+      - Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot``, and
+        ``jax.lax.dot_general``; operations like ``jax.numpy.dot``, which
+        involve vector and matrix computations; batched matrix multiplications;
+        ``jax.numpy.einsum`` with matrix-multiplication patterns; and other
+        linear algebra operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
      - :version-ref:`hipBLASLt rocm_version`
      - hipBLASLt is an extension of hipBLAS, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
+      - Matrix multiplication in ``jax.numpy.matmul`` or ``jax.lax.dot``. The
+        XLA (Accelerated Linear Algebra) compiler uses hipBLASLt for optimized
+        matrix operations, mixed-precision support, and hardware-specific
+        optimizations.
    * - `hipCUB <https://github.com/ROCm/hipCUB>`_
      - :version-ref:`hipCUB rocm_version`
      - Provides a C++ template library for parallel algorithms for reduction,
        scan, sort and select.
+      - Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
+        ``jax.numpy.prod``, ``jax.numpy.max`` and ``jax.numpy.min``), prefix sum
+        (``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
+        (``jax.numpy.sort``, ``jax.numpy.argsort``).
    * - `hipFFT <https://github.com/ROCm/hipFFT>`_
      - :version-ref:`hipFFT rocm_version`
      - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
+      - Used in functions like ``jax.numpy.fft``.
    * - `hipRAND <https://github.com/ROCm/hipRAND>`_
      - :version-ref:`hipRAND rocm_version`
      - Provides fast random number generation for GPUs.
+      - Used in functions like ``jax.random.uniform``, ``jax.random.normal``,
+        ``jax.random.randint``, and ``jax.random.split``.
    * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
      - :version-ref:`hipSOLVER rocm_version`
      - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
        singular value decompositions (SVD).
+      - Solving linear systems (``jax.numpy.linalg.solve``), matrix
+        factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
+        (``jax.numpy.linalg.eig``).
    * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
      - :version-ref:`hipSPARSE rocm_version`
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
+      - Sparse matrix multiplication (``jax.numpy.matmul``), sparse
+        matrix-vector and matrix-matrix products
+        (``jax.experimental.sparse.dot``), sparse linear system solvers and
+        sparse data handling.
    * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
      - :version-ref:`hipSPARSELt rocm_version`
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
+      - Sparse matrix multiplication (``jax.numpy.matmul``), sparse
+        matrix-vector and matrix-matrix products
+        (``jax.experimental.sparse.dot``) and sparse linear system solvers.
    * - `MIOpen <https://github.com/ROCm/MIOpen>`_
      - :version-ref:`MIOpen rocm_version`
      - Optimized for deep learning primitives such as convolutions, pooling,
        normalization, and activation functions.
+      - Speeds up convolutional neural networks (CNNs), recurrent neural
+        networks (RNNs), and other layers. Used in operations like
+        ``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
    * - `RCCL <https://github.com/ROCm/rccl>`_
      - :version-ref:`RCCL rocm_version`
      - Optimized for multi-GPU communication for operations like  all-reduce,
        broadcast, and scatter.
+      - Distributing computations across multiple GPUs with ``pmap`` and
+        ``jax.distributed``. XLA automatically uses RCCL when executing
+        operations across multiple GPUs on AMD hardware.
    * - `rocThrust <https://github.com/ROCm/rocThrust>`_
      - :version-ref:`rocThrust rocm_version`
      - Provides a C++ template library for parallel algorithms like sorting,
        reduction, and scanning.
+      - Reduction operations like ``jax.numpy.sum``, ``jax.pmap`` for
+        distributed training (which involves parallel reductions), and prefix
+        sum operations like ``jax.numpy.cumsum`` can use rocThrust.

-.. note::
-
-    This table shows ROCm libraries that could potentially be utilized by JAX. Not
-    all libraries may be used in every configuration, and the actual library usage
-    will depend on the specific operations and implementation details.
-
-Supported data types and modules
+Supported and unsupported features
 ===============================================================================

-The following tables lists the supported public JAX API data types and modules.
-
-Supported data types
--------------------------------------------------------------------------------
-
-ROCm supports all the JAX data types of `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
-module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_
-and `default_dtype <https://docs.jax.dev/en/latest/default_dtypes.html>`_ .
-The ROCm supported data types in JAX are collected in the following table.
+The following table maps GPU-accelerated JAX modules to their supported
+ROCm and JAX versions.

 .. list-table::
    :header-rows: 1

-    * - Data type
+    * - Module
      - Description
+      - Since JAX
+      - Since ROCm
+    * - ``jax.numpy``
+      - Implements the NumPy API, using the primitives in ``jax.lax``.
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.scipy``
+      - Provides GPU-accelerated and differentiable implementations of many
+        functions from the SciPy library, leveraging JAX's transformations
+        (e.g., ``grad``, ``jit``, ``vmap``).
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.lax``
+      - A library of primitive operations that underpins libraries such as
+        ``jax.numpy``. Transformation rules, such as Jacobian-vector product
+        (JVP) and batching rules, are typically defined as transformations on
+        ``jax.lax`` primitives.
+      - 0.1.57
+      - 5.0.0
+    * - ``jax.random``
+      - Provides a number of routines for deterministic generation of sequences
+        of pseudorandom numbers.
+      - 0.1.58
+      - 5.0.0
+    * - ``jax.sharding``
+      - Defines how arrays are partitioned and distributed across multiple
+        devices.
+      - 0.3.20
+      - 5.1.0
+    * - ``jax.dlpack``
+      - For exchanging tensor data between JAX and other libraries that support the
+        DLPack standard.
+      - 0.1.57
+      - 5.0.0
+    * - ``jax.distributed``
+      - Enables the scaling of computations across multiple devices on a single
+        machine or across multiple machines.
+      - 0.1.74
+      - 5.0.0
+    * - ``jax.dtypes``
+      - Provides utilities for working with and managing data types in JAX
+        arrays and computations.
+      - 0.1.66
+      - 5.0.0
+    * - ``jax.image``
+      - Contains image manipulation functions like resize, scale and translation.
+      - 0.1.57
+      - 5.0.0
+    * - ``jax.nn``
+      - Contains common functions for neural network libraries.
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.ops``
+      - Computes the minimum, maximum, sum or product within segments of an
+        array.
+      - 0.1.57
+      - 5.0.0
+    * - ``jax.profiler``
+      - Contains JAX’s tracing and time profiling features.
+      - 0.1.57
+      - 5.0.0
+    * - ``jax.stages``
+      - Contains interfaces to stages of the compiled execution process.
+      - 0.3.4
+      - 5.0.0
+    * - ``jax.tree``
+      - Provides utilities for working with tree-like container data structures.
+      - 0.4.26
+      - 5.6.0
+    * - ``jax.tree_util``
+      - Provides utilities for working with nested data structures, or
+        ``pytrees``.
+      - 0.1.65
+      - 5.0.0
+    * - ``jax.typing``
+      - Provides JAX-specific static type annotations.
+      - 0.3.18
+      - 5.1.0
+    * - ``jax.extend``
+      - Provides modules for access to JAX internal machinery. The
+        ``jax.extend`` module defines a library view of some of JAX’s internal
+        components.
+      - 0.4.15
+      - 5.5.0
+    * - ``jax.example_libraries``
+      - Serves as a collection of example code and libraries that demonstrate
+        various capabilities of JAX.
+      - 0.1.74
+      - 5.0.0
+    * - ``jax.experimental``
+      - Namespace for experimental features and APIs that are in development or
+        are not yet fully stable for production use.
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.lib``
+      - Set of internal tools and types for bridging between JAX’s Python
+        frontend and its XLA backend.
+      - 0.4.6
+      - 5.3.0
+    * - ``jax_triton``
+      - Library that integrates the Triton deep learning compiler with JAX.
+      - jax_triton 0.2.0
+      - 6.2.4

-    * - ``bfloat16``
-      - 16-bit bfloat (brain floating point).
+jax.scipy module
+-------------------------------------------------------------------------------

-    * - ``bool``
-      - Boolean.
+A SciPy-like API for scientific computing.

-    * - ``complex128``
-      - 128-bit complex.
+.. list-table::
+    :header-rows: 1

-    * - ``complex64``
-      - 64-bit complex.
+    * - Module
+      - Since JAX
+      - Since ROCm
+    * - ``jax.scipy.cluster``
+      - 0.3.11
+      - 5.1.0
+    * - ``jax.scipy.fft``
+      - 0.1.71
+      - 5.0.0
+    * - ``jax.scipy.integrate``
+      - 0.4.15
+      - 5.5.0
+    * - ``jax.scipy.interpolate``
+      - 0.1.76
+      - 5.0.0
+    * - ``jax.scipy.linalg``
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.scipy.ndimage``
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.scipy.optimize``
+      - 0.1.57
+      - 5.0.0
+    * - ``jax.scipy.signal``
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.scipy.spatial.transform``
+      - 0.4.12
+      - 5.4.0
+    * - ``jax.scipy.sparse.linalg``
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.scipy.special``
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.scipy.stats``
+      - 0.1.56
+      - 5.0.0

-    * - ``float16``
-      - 16-bit (half precision) floating-point.
+jax.scipy.stats module
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-    * - ``float32``
-      - 32-bit (single precision) floating-point.
+.. list-table::
+   :header-rows: 1

-    * - ``float64``
-      - 64-bit (double precision) floating-point.
+   * - Module
+     - Since JAX
+     - Since ROCm
+   * - ``jax.scipy.stats.bernoulli``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.beta``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.betabinom``
+     - 0.1.61
+     - 5.0.0
+   * - ``jax.scipy.stats.binom``
+     - 0.4.14
+     - 5.4.0
+   * - ``jax.scipy.stats.cauchy``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.chi2``
+     - 0.1.61
+     - 5.0.0
+   * - ``jax.scipy.stats.dirichlet``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.expon``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.gamma``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.gennorm``
+     - 0.3.15
+     - 5.2.0
+   * - ``jax.scipy.stats.geom``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.laplace``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.logistic``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.multinomial``
+     - 0.3.18
+     - 5.1.0
+   * - ``jax.scipy.stats.multivariate_normal``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.nbinom``
+     - 0.1.72
+     - 5.0.0
+   * - ``jax.scipy.stats.norm``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.pareto``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.poisson``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.t``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.truncnorm``
+     - 0.4.0
+     - 5.3.0
+   * - ``jax.scipy.stats.uniform``
+     - 0.1.56
+     - 5.0.0
+   * - ``jax.scipy.stats.vonmises``
+     - 0.4.2
+     - 5.3.0
+   * - ``jax.scipy.stats.wrapcauchy``
+     - 0.4.20
+     - 5.6.0

-    * - ``half``
-      - 16-bit (half precision) floating-point.
+jax.extend module
+-------------------------------------------------------------------------------

-    * - ``int16``
-      - Signed 16-bit integer.
+Modules for JAX extensions.

-    * - ``int32``
-      - Signed 32-bit integer.
+.. list-table::
+    :header-rows: 1

-    * - ``int64``
-      - Signed 64-bit integer.
+    * - Module
+      - Since JAX
+      - Since ROCm
+    * - ``jax.extend.ffi``
+      - 0.4.30
+      - 6.0.0
+    * - ``jax.extend.linear_util``
+      - 0.4.17
+      - 5.6.0
+    * - ``jax.extend.mlir``
+      - 0.4.26
+      - 5.6.0
+    * - ``jax.extend.random``
+      - 0.4.15
+      - 5.5.0

-    * - ``int8``
-      - Signed 8-bit integer.
+jax.experimental module
+-------------------------------------------------------------------------------

-    * - ``uint16``
-      - Unsigned 16-bit (word) integer.
+Experimental modules and APIs.

-    * - ``uint32``
-      - Unsigned 32-bit (dword) integer.
+.. list-table::
+    :header-rows: 1

-    * - ``uint64``
-      - Unsigned 64-bit (qword) integer.
+    * - Module
+      - Since JAX
+      - Since ROCm
+    * - ``jax.experimental.checkify``
+      - 0.1.75
+      - 5.0.0
+    * - ``jax.experimental.compilation_cache.compilation_cache``
+      - 0.1.68
+      - 5.0.0
+    * - ``jax.experimental.custom_partitioning``
+      - 0.4.0
+      - 5.3.0
+    * - ``jax.experimental.jet``
+      - 0.1.56
+      - 5.0.0
+    * - ``jax.experimental.key_reuse``
+      - 0.4.26
+      - 5.6.0
+    * - ``jax.experimental.mesh_utils``
+      - 0.1.76
+      - 5.0.0
+    * - ``jax.experimental.multihost_utils``
+      - 0.3.2
+      - 5.0.0
+    * - ``jax.experimental.pallas``
+      - 0.4.15
+      - 5.5.0
+    * - ``jax.experimental.pjit``
+      - 0.1.61
+      - 5.0.0
+    * - ``jax.experimental.serialize_executable``
+      - 0.4.0
+      - 5.3.0
+    * - ``jax.experimental.shard_map``
+      - 0.4.3
+      - 5.3.0
+    * - ``jax.experimental.sparse``
+      - 0.1.75
+      - 5.0.0

-    * - ``uint8``
-      - Unsigned 8-bit (byte) integer.
+.. list-table::
+    :header-rows: 1

-.. note::
+    * - API
+      - Since JAX
+      - Since ROCm
+    * - ``jax.experimental.enable_x64``
+      - 0.1.60
+      - 5.0.0
+    * - ``jax.experimental.disable_x64``
+      - 0.1.60
+      - 5.0.0

-  JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
-  collected on :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
-  page.
+jax.experimental.pallas module
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Supported modules
--------------------------------------------------------------------------------
+Module for Pallas, a JAX extension for custom kernels.

-For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
-``jax.scipy``, ``jax.lax``), their descriptions, and usage, please refer directly to the
-`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.
+.. list-table::
+    :header-rows: 1

-.. note::
+    * - Module
+      - Since JAX
+      - Since ROCm
+    * - ``jax.experimental.pallas.mosaic_gpu``
+      - 0.4.31
+      - 6.1.3
+    * - ``jax.experimental.pallas.tpu``
+      - 0.4.15
+      - 5.5.0
+    * - ``jax.experimental.pallas.triton``
+      - 0.4.32
+      - 6.1.3

-  Since version 0.1.56, JAX has full support for ROCm, and the
-  :ref:`Known issues and important notes <jax_comp_known_issues>` section
-  contains details about limitations specific to the ROCm backend. The list of
-  JAX API modules is maintained by the JAX project and is subject to change. 
-  Refer to the official Jax documentation for the most up-to-date information.
+jax.experimental.sparse module
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Experimental support for sparse matrix operations.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Module
+      - Since JAX
+      - Since ROCm
+    * - ``jax.experimental.sparse.linalg``
+      - 0.3.15
+      - 5.2.0
+    * - ``jax.experimental.sparse.sparsify``
+      - 0.3.25
+      - ❌
+
+.. list-table::
+    :header-rows: 1
+
+    * - ``sparse`` data structure API
+      - Since JAX
+      - Since ROCm
+    * - ``jax.experimental.sparse.BCOO``
+      - 0.1.72
+      - 5.0.0
+    * - ``jax.experimental.sparse.BCSR``
+      - 0.3.20
+      - 5.1.0
+    * - ``jax.experimental.sparse.CSR``
+      - 0.1.75
+      - 5.0.0
+    * - ``jax.experimental.sparse.NM``
+      - 0.4.27
+      - 5.6.0
+    * - ``jax.experimental.sparse.COO``
+      - 0.1.75
+      - 5.0.0
+
+Unsupported JAX features
+------------------------
+
+The following are GPU-accelerated JAX features not currently supported by
+ROCm.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Feature
+      - Description
+      - Since JAX
+    * - Mixed Precision with TF32
+      - Mixed precision with TF32 is used for matrix multiplications,
+        convolutions, and other linear algebra operations, particularly in
+        deep learning workloads like CNNs and transformers.
+      - 0.2.25
+    * - RNN support
+      - Currently only LSTM with double bias is supported with float32 input
+        and weight.
+      - 0.3.25
+    * - XLA int4 support
+      - 4-bit integer (int4) precision in the XLA compiler.
+      - 0.4.0
+    * - ``jax.experimental.sparse.sparsify``
+      - Converts a dense matrix to a sparse matrix representation.
+      - Experimental
+
+Use cases and recommendations
+================================================================================
+
+* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
+  blog explores the implementation and training of a Generative Pre-trained
+  Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s PyTorch-based
+  nanoGPT. By comparing how essential GPT components—such as self-attention
+  mechanisms and optimizers—are realized in PyTorch and JAX, the blog also
+  highlights JAX’s unique features.
+
+* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
+  ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
+  blog post provides a comprehensive guide on enhancing the training efficiency
+  of GPT models by implementing mixed precision techniques in JAX, specifically
+  tailored for AMD GPUs utilizing the ROCm platform.
+
+* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
+  blog demonstrates how to develop a custom fused dropout-activation kernel for
+  matrices using Triton, integrate it with JAX, and benchmark its performance
+  using ROCm.
+
+* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
+  blog post outlines the process of fine-tuning a Bidirectional Encoder
+  Representations from Transformers (BERT)-based large language model (LLM)
+  using JAX for a text classification task. The blog post discusses techniques
+  for parallelizing the fine-tuning across multiple AMD GPUs and assesses the
+  model's performance on a holdout dataset. During the fine-tuning, a
+  BERT-base-cased transformer model and the General Language Understanding
+  Evaluation (GLUE) benchmark dataset were used on a multi-GPU setup.
+
+* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
+  provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
+  accelerator using ROCm. The page is aimed at helping users achieve optimal
+  performance for deep learning and other high-performance computing tasks on
+  the MI300X GPU.
+
+For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -95,7 +95,7 @@ Docker image compatibility

 AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
 with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories were tested on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_.
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: PyTorch Docker image components
@@ -116,122 +116,137 @@ Click |docker-icon| to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-c76af9bfb1c25b0f40d4c29e8652105c57250bf018d23ff595b06bd79666fdd7"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-ab1d350b818b90123cfda31363019d11c0d41a8f12a19e3cb2cb40cf0261137d"><i class="fab fa-docker fa-lg"></i></a>

      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.16.0 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
+      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-f9d226135d51831c810dcb1251636ec61f85c65fcdda03e188c053a5d4f6585b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-130536fdfceb374626a7bcb8d00b9d796ddfc3115677d51229e5b852d96b5ef4"><i class="fab fa-docker fa-lg"></i></a>

      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 22.04
-      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
+      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-3490e74d4f43dcdb3351dd334108d1ccd47e5a687c0523a2424ac1bcdd3dd6dd"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-20a2e24b4738dc1f1a44a04f23827918b56c99f7e697e6fccb90e9c4fae8ca9b"><i class="fab fa-docker fa-lg"></i></a>

      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
      - 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
+      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-26c5dfffb4a54625884abca83166940f17dd27bc75f1b24f6e80fbcb7d4e9afb"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-f09cb8ca39cc39222fb554060711f5c19130f7b4047aaf41fad4ba3ec470ca03"><i class="fab fa-docker fa-lg"></i></a>

      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
      - 22.04
-      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `3.11.9 <https://www.python.org/downloads/release/python-3119/>`_
      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
+      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
+      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-f378a24561fa6efc178b6dc93fc7d82e5b93653ecd59c89d4476674d29e1284d"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-a91c100d1fe608dae3eb7f60a751630363d4027ac3d077d428e92945204c338e"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
+      - 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
+      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
+      - `master <https://bitbucket.org/icl/magma/src/master/>`_
+      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
+      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
+      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-66a89ce6485bb887af74bb9bd76bb613ab9834a6b1374649ea7ae379883454a4"><i class="fab fa-docker fa-lg"></i></a>

      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
+      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-2308dbd0e650b7bf8d548575cbb6e2bdc021f9386384ce570da16d58ee684d22"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-c716cf167e6e49893f11de03606ed37044153aca089e74ca615065c06877f86b"><i class="fab fa-docker fa-lg"></i></a>

      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 22.04
-      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
+      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
+      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-eefd2ab019728f91f94c5e6a9463cb0ea900b3011458d18fe5d88e50c0b57d86"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-0434cbc9b07b2c26e39480d7447f676f9057a1054dcff00e0050c25a6eddbd3c"><i class="fab fa-docker fa-lg"></i></a>

      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
+      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-473643226ab0e93a04720b256ed772619878abf9c42b9f84828cefed522696fd"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-688b1c0073092615fb98778d78b16191e506097ee116a2d3d2628b264d5d367b"><i class="fab fa-docker fa-lg"></i></a>

      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 22.04
-      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
+      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

 Key ROCm libraries for PyTorch
@@ -372,15 +387,24 @@ feature set available to developers.
        involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
        more.

-Supported modules and data types
+Supported features
 ================================================================================

-The following section outlines the supported data types, modules, and domain libraries available in PyTorch on ROCm.
+This section maps GPU-accelerated PyTorch features to their supported ROCm and
+PyTorch versions.

-Supported data types
+torch
 --------------------------------------------------------------------------------

-The tensor data type is specified using the ``dtype`` attribute or argument.
+`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
+PyTorch, providing data structures for multi-dimensional tensors and
+implementing mathematical operations on them. It also includes utilities for
+efficient serialization of tensors and arbitrary data types and other tools.
+
+Tensor data types
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The tensor data type is specified using the ``dtype`` attribute or argument. 
 PyTorch supports many data types for different use cases.

 The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_
@@ -391,154 +415,539 @@ single data types:

    * - Data type
      - Description
+      - As of PyTorch
+      - As of ROCm
    * - ``torch.float8_e4m3fn``
      - 8-bit floating point, e4m3
+      - 2.3
+      - 5.5
    * - ``torch.float8_e5m2``
      - 8-bit floating point, e5m2
+      - 2.3
+      - 5.5
    * - ``torch.float16`` or ``torch.half``
      - 16-bit floating point
+      - 0.1.6
+      - 2.0
    * - ``torch.bfloat16``
      - 16-bit floating point
+      - 1.6
+      - 2.6
    * - ``torch.float32`` or ``torch.float``
      - 32-bit floating point
+      - 0.1.12_2
+      - 2.0
    * - ``torch.float64`` or ``torch.double``
      - 64-bit floating point
+      - 0.1.12_2
+      - 2.0
    * - ``torch.complex32`` or ``torch.chalf``
-      - 32-bit complex numbers
+      - PyTorch provides native support for 32-bit complex numbers
+      - 1.6
+      - 2.0
    * - ``torch.complex64`` or ``torch.cfloat``
-      - 64-bit complex numbers
+      - PyTorch provides native support for 64-bit complex numbers
+      - 1.6
+      - 2.0
    * - ``torch.complex128`` or ``torch.cdouble``
-      - 128-bit complex numbers
+      - PyTorch provides native support for 128-bit complex numbers
+      - 1.6
+      - 2.0
    * - ``torch.uint8``
      - 8-bit integer (unsigned)
+      - 0.1.12_2
+      - 2.0
    * - ``torch.uint16``
-      - 16-bit integer (unsigned);
-        Not natively supported in ROCm
+      - 16-bit integer (unsigned)
+      - 2.3
+      - Not natively supported
    * - ``torch.uint32``
-      - 32-bit integer (unsigned);
-        Not natively supported in ROCm
+      - 32-bit integer (unsigned)
+      - 2.3
+      - Not natively supported
    * - ``torch.uint64``
-      - 64-bit integer (unsigned);
-        Not natively supported in ROCm
+      - 64-bit integer (unsigned)
+      - 2.3
+      - Not natively supported
    * - ``torch.int8``
      - 8-bit integer (signed)
+      - 1.12
+      - 5.0
    * - ``torch.int16`` or ``torch.short``
      - 16-bit integer (signed)
+      - 0.1.12_2
+      - 2.0
    * - ``torch.int32`` or ``torch.int``
      - 32-bit integer (signed)
+      - 0.1.12_2
+      - 2.0
    * - ``torch.int64`` or ``torch.long``
      - 64-bit integer (signed)
+      - 0.1.12_2
+      - 2.0
    * - ``torch.bool``
      - Boolean
+      - 1.2
+      - 2.0
    * - ``torch.quint8``
      - Quantized 8-bit integer (unsigned)
+      - 1.8
+      - 5.0
    * - ``torch.qint8``
      - Quantized 8-bit integer (signed)
+      - 1.8
+      - 5.0
    * - ``torch.qint32``
      - Quantized 32-bit integer (signed)
+      - 1.8
+      - 5.0
    * - ``torch.quint4x2``
      - Quantized 4-bit integer (unsigned)
+      - 1.8
+      - 5.0

 .. note::

-  Unsigned types, except ``uint8``, have limited support in eager mode. They
+  Unsigned types except ``uint8`` have limited support in eager mode. They
  primarily exist to assist usage with ``torch.compile``.

  See :doc:`ROCm precision support <rocm:reference/precision-support>` for the
  native hardware support of data types.

-Supported modules
--------------------------------------------------------------------------------
+torch.cuda
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-For a complete and up-to-date list of PyTorch core modules (for example., ``torch``,
-``torch.nn``, ``torch.cuda``, ``torch.backends.cuda`` and
-``torch.backends.cudnn``), their descriptions, and usage, please refer directly
-to the `official PyTorch documentation <https://pytorch.org/docs/stable/index.html>`_.
-
-Core PyTorch functionality on ROCm includes tensor operations, neural network
-layers, automatic differentiation, distributed training, mixed-precision
-training, compilation features, and domain-specific libraries for audio, vision,
-text processing, and more.
-
-Supported domain libraries
--------------------------------------------------------------------------------
-
-PyTorch offers specialized `domain libraries <https://pytorch.org/domains/>`_ with
-GPU acceleration that build on its core features to support specific application
-areas. The table below lists the PyTorch domain libraries that are compatible
-with ROCm.
+``torch.cuda`` in PyTorch is a module that provides utilities and functions for
+managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
+computations, memory management, and efficient execution of tensor operations,
+leveraging ROCm and CUDA as the underlying frameworks.

 .. list-table::
    :header-rows: 1

-    * - Library
+    * - Feature
      - Description
+      - As of PyTorch
+      - As of ROCm
+    * - Device management
+      - Utilities for managing and interacting with GPUs.
+      - 0.4.0
+      - 3.8
+    * - Tensor operations on GPU
+      - Performs tensor operations such as addition and matrix multiplications on
+        the GPU.
+      - 0.4.0
+      - 3.8
+    * - Streams and events
+      - Streams allow overlapping computation and communication for optimized
+        performance. Events enable synchronization.
+      - 1.6.0
+      - 3.8
+    * - Memory management
+      - Functions to manage and inspect memory usage like
+        ``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
+        ``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
+      - 0.3.0
+      - 1.9.2
+    * - Running process lists of memory management
+      - Returns a human-readable printout of the running processes and their GPU
+        memory use for a given device with functions like
+        ``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
+      - 1.8.0
+      - 4.0
+    * - Communication collectives
+      - Set of APIs that enable efficient communication between multiple GPUs,
+        allowing for distributed computing and data parallelism.
+      - 1.9.0
+      - 5.0
+    * - ``torch.cuda.CUDAGraph``
+      - Graphs capture sequences of GPU operations to minimize kernel launch
+        overhead and improve performance.
+      - 1.10.0
+      - 5.3
+    * - TunableOp
+      - A mechanism that allows certain operations to be more flexible and
+        optimized for performance. It enables automatic tuning of kernel
+        configurations and other settings to achieve the best possible
+        performance based on the specific hardware (GPU) and workload.
+      - 2.0
+      - 5.4
+    * - NVIDIA Tools Extension (NVTX)
+      - Integration with NVTX for profiling and debugging GPU performance using
+        NVIDIA's Nsight tools.
+      - 1.8.0
+      - ❌
+    * - Lazy loading NVRTC
+      - Delays JIT compilation with NVRTC until the code is explicitly needed.
+      - 1.13.0
+      - ❌
+    * - Jiterator (beta)
+      - Jiterator compiles user-defined elementwise kernels, supplied as code
+        strings, just in time and runs them on the GPU.
+      - 1.13.0
+      - 5.2

-    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_ 
-      - Audio and signal processing library for PyTorch. Provides utilities for
-        audio I/O, signal and data processing functions, datasets, model
-        implementations, and application components for audio and speech
-        processing tasks.
+.. Need to validate and extend.

-        **Note:** To ensure GPU-acceleration with ``torchaudio.transforms``,
-        you need to explicitly move audio data (waveform tensor) to GPU using
-        ``.to('cuda')``.
+torch.backends.cuda
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-    * - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
-      - PyTorch-native library designed for fine-tuning large language models
-        (LLMs). Provides supports the full fine-tuning workflow and offers
-        compatibility with popular production inference systems.
+``torch.backends.cuda`` is a PyTorch module that provides configuration options
+and flags to control the behavior of ROCm or CUDA operations. It is part of the
+PyTorch backend configuration system, which allows users to fine-tune how
+PyTorch interacts with the ROCm or CUDA environment.

-        **Note:** Only official release exists.
+.. list-table::
+    :header-rows: 1

-    * - `torchvision <https://docs.pytorch.org/vision/stable/index.html>`_
-      - Computer vision library that is part of the PyTorch project. Provides
-        popular datasets, model architectures, and common image transformations
-        for computer vision applications.
+    * - Feature
+      - Description
+      - As of PyTorch
+      - As of ROCm
+    * - ``cufft_plan_cache``
+      - Manages caching of GPU FFT plans to optimize repeated FFT computations.
+      - 1.7.0
+      - 5.0
+    * - ``matmul.allow_tf32``
+      - Enables or disables the use of TensorFloat-32 (TF32) precision for
+        faster matrix multiplications on GPUs with Tensor Cores.
+      - 1.10.0
+      - ❌
+    * - ``matmul.allow_fp16_reduced_precision_reduction``
+      - Reduced precision reductions (e.g., with fp16 accumulation type) are
+        allowed with fp16 GEMMs.
+      - 2.0
+      - ❌
+    * - ``matmul.allow_bf16_reduced_precision_reduction``
+      - Reduced precision reductions are allowed with bf16 GEMMs.
+      - 2.0
+      - ❌
+    * - ``enable_cudnn_sdp``
+      - Globally enables cuDNN SDPA's kernels within SDPA.
+      - 2.0
+      - ❌
+    * - ``enable_flash_sdp``
+      - Globally enables or disables FlashAttention for SDPA.
+      - 2.1
+      - ❌
+    * - ``enable_mem_efficient_sdp``
+      - Globally enables or disables Memory-Efficient Attention for SDPA.
+      - 2.1
+      - ❌
+    * - ``enable_math_sdp``
+      - Globally enables or disables the PyTorch C++ implementation within SDPA.
+      - 2.1
+      - ❌

-    * - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
-      - Text processing library for PyTorch. Provides data processing utilities
-        and popular datasets for natural language processing, including
-        tokenization, vocabulary management, and text embeddings.
+.. Need to validate and extend.

-        **Note:** ``torchtext`` does not implement ROCm-specific kernels. 
-        ROCm acceleration is provided through the underlying PyTorch framework
-        and ROCm library integration. Only official release exists.
+torch.backends.cudnn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-    * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
-      - Beta library of common modular data loading primitives for easily
-        constructing flexible and performant data pipelines, with features still
-        in prototype stage.
+Supported ``torch.backends.cudnn`` options include:

-    * - `torchrec <https://docs.pytorch.org/torchrec/>`_
-      - PyTorch domain library for common sparsity and parallelism primitives
-        needed for large-scale recommender systems, enabling authors to train
-        models with large embedding tables shared across many GPUs.
+.. list-table::
+    :header-rows: 1

-        **Note:** ``torchrec`` does not implement ROCm-specific kernels. ROCm
-        acceleration is provided through the underlying PyTorch framework and
-        ROCm library integration.
+    * - Option
+      - Description
+      - As of PyTorch
+      - As of ROCm
+    * - ``allow_tf32``
+      - TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
+        Ampere or newer GPUs.
+      - 1.12.0
+      - ❌
+    * - ``deterministic``
+      - A bool that, if True, causes cuDNN to only use deterministic
+        convolution algorithms.
+      - 1.12.0
+      - 6.0

-    * - `torchserve <https://docs.pytorch.org/serve/>`_
-      - Performant, flexible and easy-to-use tool for serving PyTorch models in
-        production, providing features for model management, batch processing,
-        and scalable deployment.
+Automatic mixed precision: torch.amp
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-        **Note:** `torchserve <https://docs.pytorch.org/serve/>`_ is no longer
-        actively maintained. Last official release is sent out with PyTorch 2.4.
+PyTorch automates the process of using both 16-bit (half-precision, float16) and
+32-bit (single-precision, float32) floating-point types in model training and
+inference.

-    * - `torchrl <https://docs.pytorch.org/rl/stable/index.html>`_
-      - Open-source, Python-first Reinforcement Learning library for PyTorch
-        with a focus on high modularity and good runtime performance, providing
-        low and high-level RL abstractions and reusable functionals for cost
-        functions, returns, and data processing.
+.. list-table::
+    :header-rows: 1

-        **Note:** Only official release exists.
+    * - Feature
+      - Description
+      - As of PyTorch
+      - As of ROCm
+    * - Autocasting
+      - Autocast instances serve as context managers or decorators that allow
+        regions of your script to run in mixed precision.
+      - 1.9
+      - 2.5
+    * - Gradient scaling
+      - To prevent underflow, “gradient scaling” multiplies the network’s
+        loss by a scale factor and invokes a backward pass on the scaled
+        loss. The same factor then scales gradients flowing backward through
+        the network. In other words, gradient values have a larger magnitude so
+        that they don’t flush to zero.
+      - 1.9
+      - 2.5
+    * - CUDA op-specific behavior
+      - These ops always go through autocasting whether they are invoked as part
+        of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
+        functions are exposed in multiple namespaces, they go through
+        autocasting regardless of the namespace.
+      - 1.9
+      - 2.5

-    * - `tensordict <https://docs.pytorch.org/tensordict/stable/index.html>`_
-      - Dictionary-like class that simplifies operations on batches of tensors,
-        enhancing code readability, compactness, and modularity by abstracting
-        tailored operations and reducing errors through automatic operation
-        dispatching.
+Distributed library features
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-        **Note:** Only official release exists.
+PyTorch distributed library includes a collective of parallelism modules, a
+communications layer, and infrastructure for launching and debugging large
+training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.
+
+The Distributed Library feature in PyTorch provides tools and APIs for building
+and running distributed machine learning workflows. It allows training models
+across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
+of computational resources and scalability for large-scale tasks.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Feature
+      - Description
+      - As of PyTorch
+      - As of ROCm
+    * - TensorPipe
+      - A point-to-point communication library integrated into
+        PyTorch for distributed training. It handles tensor data transfers
+        efficiently between different processes or devices, including those on
+        separate machines.
+      - 1.8
+      - 5.4
+    * - Gloo
+      - Designed for multi-machine and multi-GPU setups, enabling
+        efficient communication and synchronization between processes. Gloo is
+        one of the default backends for PyTorch's Distributed Data Parallel
+        (DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
+      - 1.0
+      - 2.0
+
+torch.compiler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+    :header-rows: 1
+
+    * - Feature
+      - Description
+      - As of PyTorch
+      - As of ROCm
+    * - ``torch.compiler`` (AOT Autograd)
+      - Autograd captures not only the user-level code, but also backpropagation,
+        which results in capturing the backwards pass “ahead-of-time”. This
+        enables acceleration of both forwards and backwards pass using
+        ``TorchInductor``.
+      - 2.0
+      - 5.3
+    * - ``torch.compiler`` (TorchInductor)
+      - The default ``torch.compile`` deep learning compiler that generates fast
+        code for multiple accelerators and backends. You need to use a backend
+        compiler to make speedups through ``torch.compile`` possible. For AMD,
+        NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
+      - 2.0
+      - 5.3
+
+torchaudio
+--------------------------------------------------------------------------------
+
+The `torchaudio <https://pytorch.org/audio/stable/index.html>`_ library provides
+utilities for processing audio data in PyTorch, such as audio loading,
+transformations, and feature extraction.
+
+To ensure GPU-acceleration with ``torchaudio.transforms``, you need to
+explicitly move audio data (waveform tensor) to GPU using ``.to('cuda')``.
+
+The following ``torchaudio`` features are GPU-accelerated.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Feature
+      - Description
+      - As of torchaudio version
+      - As of ROCm
+    * - ``torchaudio.transforms.Spectrogram``
+      - Generate a spectrogram of an input waveform using STFT.
+      - 0.6.0
+      - 4.5
+    * - ``torchaudio.transforms.MelSpectrogram``
+      - Generates the mel-scale spectrogram of raw audio signals.
+      - 0.9.0
+      - 4.5
+    * - ``torchaudio.transforms.MFCC``
+      - Extracts MFCC features.
+      - 0.9.0
+      - 4.5
+    * - ``torchaudio.transforms.Resample``
+      - Resamples a signal from one frequency to another.
+      - 0.9.0
+      - 4.5
+
+torchvision
+--------------------------------------------------------------------------------
+
+The `torchvision <https://pytorch.org/vision/stable/index.html>`_ library
+provides datasets, model architectures, and common image transformations for
+computer vision.
+
+The following ``torchvision`` features are GPU-accelerated.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Feature
+      - Description
+      - As of torchvision version
+      - As of ROCm
+    * - ``torchvision.transforms.functional``
+      - Provides GPU-compatible transformations for image preprocessing like
+        resize, normalize, rotate and crop.
+      - 0.2.0
+      - 4.0
+    * - ``torchvision.ops``
+      - GPU-accelerated operations for object detection and segmentation tasks.
+        ``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
+        ``box_convert``.
+      - 0.6.0
+      - 3.3
+    * - ``torchvision.models`` with ``.to('cuda')``
+      - ``torchvision`` provides several pre-trained models (ResNet, Faster
+        R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
+        training.
+      - 0.1.6
+      - 2.x
+    * - ``torchvision.io``
+      - Enables video decoding and frame extraction using GPU acceleration with NVIDIA’s
+        NVDEC and nvJPEG (rocJPEG) on CUDA-enabled GPUs.
+      - 0.4.0
+      - 6.3
+
+torchtext
+--------------------------------------------------------------------------------
+
+The `torchtext <https://pytorch.org/text/stable/index.html>`_ library provides
+utilities for processing and working with text data in PyTorch, including
+tokenization, vocabulary management, and text embeddings. torchtext supports
+preprocessing pipelines and integration with PyTorch models, simplifying the
+implementation of natural language processing (NLP) tasks.
+
+To leverage GPU acceleration in torchtext, you need to move tensors
+explicitly to the GPU using ``.to('cuda')``.
+
+* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.
+
+* Only official release exists.
+
+torchtune
+--------------------------------------------------------------------------------
+
+The `torchtune <https://pytorch.org/torchtune/stable/index.html>`_ library is
+used for authoring, fine-tuning, and experimenting with LLMs.
+
+* Usage: Enabling developers to fine-tune ROCm PyTorch solutions.
+
+* Only official release exists.
+
+torchserve
+--------------------------------------------------------------------------------
+
+`torchserve <https://pytorch.org/serve/>`_ is a performant, flexible, and
+easy-to-use tool for serving PyTorch models in production, providing features
+for model management, batch processing, and scalable deployment.
+
+* torchserve does not implement its own kernels. ROCm support is enabled by
+  linking against ROCm libraries.
+
+* Only official release exists.
+
+torchrec
+--------------------------------------------------------------------------------
+
+`torchrec <https://pytorch.org/torchrec/>`_ is a PyTorch domain library for
+common sparsity and parallelism primitives needed for large-scale recommender
+systems.
+
+* torchrec does not implement its own kernels. ROCm support is enabled by
+  linking against ROCm libraries.
+
+* Only official release exists.
+
+Unsupported PyTorch features
+================================================================================
+
+The following GPU-accelerated PyTorch features are not supported by ROCm for
+the listed supported PyTorch versions.
+
+.. list-table::
+    :widths: 30, 60, 10
+    :header-rows: 1
+
+    * - Feature
+      - Description
+      - As of PyTorch
+    * - APEX batch norm
+      - Use APEX batch norm instead of PyTorch batch norm.
+      - 1.6.0
+    * - ``torch.backends.cuda`` / ``matmul.allow_tf32``
+      - A bool that controls whether TensorFloat-32 tensor cores may be used in
+        matrix multiplications.
+      - 1.7
+    * - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
+      - Integration with NVTX for profiling and debugging GPU performance using
+        NVIDIA's Nsight tools.
+      - 1.7.0
+    * - ``torch.cuda`` / Lazy loading NVRTC
+      - Delays JIT compilation with NVRTC until the code is explicitly needed.
+      - 1.8.0
+    * - ``torch-tensorrt``
+      - Integrate TensorRT library for optimizing and deploying PyTorch models.
+        ROCm does not have an equivalent library for TensorRT.
+      - 1.9.0
+    * - ``torch.backends`` / ``cudnn.allow_tf32``
+      - TensorFloat-32 tensor cores may be used in cuDNN convolutions.
+      - 1.10.0
+    * - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
+      - Reduced precision reductions with fp16 accumulation type are
+        allowed with fp16 GEMMs.
+      - 2.0
+    * - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
+      - Reduced precision reductions are allowed with bf16 GEMMs.
+      - 2.0
+    * - ``torch.nn.functional`` / ``scaled_dot_product_attention``
+      - Flash attention backend for SDPA to accelerate attention computation in
+        transformer-based models.
+      - 2.0
+    * - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
+      - Globally enables cuDNN SDPA's kernels within SDPA.
+      - 2.0
+    * - ``torch.backends.cuda`` / ``enable_flash_sdp``
+      - Globally enables or disables FlashAttention for SDPA.
+      - 2.1
+    * - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
+      - Globally enables or disables Memory-Efficient Attention for SDPA.
+      - 2.1
+    * - ``torch.backends.cuda`` / ``enable_math_sdp``
+      - Globally enables or disables the PyTorch C++ implementation within SDPA.
+      - 2.1
+    * - Dynamic parallelism
+      - PyTorch itself does not directly expose dynamic parallelism as a core
+        feature. Dynamic parallelism allows GPU threads to launch additional
+        threads which can be reached using custom operations via the
+        ``torch.utils.cpp_extension`` module.
+      - Not a core feature
+    * - Unified memory support in PyTorch
+      - Unified Memory is not directly exposed in PyTorch's core API, but it can be
+        utilized effectively through custom CUDA extensions or advanced
+        workflows.
+      - Not a core feature
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -56,7 +56,7 @@ Docker image compatibility
 AMD validates and publishes ready-made `TensorFlow images
 <https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
 Docker Hub. The following Docker image tags and associated inventories are
-validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click
+validated for `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click
 the |docker-icon| icon to view the image on Docker Hub.

 .. list-table:: TensorFlow Docker image components
@@ -73,122 +73,82 @@ the |docker-icon| icon to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-dev/images/sha256-fa9cf5fa6c6079a7118727531ccd0056c6e3224a42c3d6e78a49e7781daafff4"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
-      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.18-runtime/images/sha256-d14d8c4989e7c9a60f4e72461b9e349de72347c6162dcd6897e6f4f80ffbb440"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-runtime/images/sha256-14addca4b92a47c806b83ebaeed593fc6672cd99f0017ed8dad759fe72ed0309"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
-      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-dev/images/sha256-081e5bd6615a5dc17247ebd2ccc26895c3feeff086720400fa39b477e60a77c0"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.18-dev/images/sha256-f5e151060df04ff5fb59f5604b49cd371931bbe75b06aec9fe7781397c4be0ce"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
-      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-runtime/images/sha256-bf369637378264f4af6ddad5ca8b8611d3e372ffbea9ab7a06f1e122f0a0867b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.18-runtime/images/sha256-5cd4c03fdb1036570c0d4929da60a65c4466998dc80f1dc8a5a0b173eae017fb"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
-      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-dev/images/sha256-5a502008c50d0b6508e6027f911bdff070a7493700ae064bed74e1d22b91ed50"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.17-dev/images/sha256-b3add80e374a2db2d1088d746e740afa89d439aca02cacba959ad298f5cd2b3f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
-      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-runtime/images/sha256-1ee5dfffceb71ac66617ada33de3a10de0cb74199cc4b82441192e5e92fa2ddf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.17-runtime/images/sha256-3a244f026c32177eff7958ffbad390de85b438b2b48b455cc39f15d70fa1270d"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
-      - `Python 3.12.10 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-dev/images/sha256-109218ad92bfae83bbd2710475f7502166e1ed54ca0b9748a9cbc3f5a1d75af1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.17-dev/images/sha256-e0cecdfacb59169335049983cdab6da578c209bb9f4d08aad97e184ae59171a6"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
-      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-runtime/images/sha256-5d78bd5918d394f92263daa2990e88d695d27200dd90ed83ec64d20c7661c9c1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.17-runtime/images/sha256-6f43de12f7eb202791b698ac51d28b72098de90034dbcd48486629b0125f7707"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

-      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
-      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_

-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-dev/images/sha256-b09b1ad921c09c687b7c916141051e9fcf15539a5686e5aa67c689195a522719"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-
-      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
-      - dev
-      - 24.04
-      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-runtime/images/sha256-20dbd824e85558abfe33fc9283cc547d88cde3c623fe95322743a5082f883a64"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-
-      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
-      - runtime
-      - 24.04
-      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-
-      - `tensorflow-rocm 2.16.2 <https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377>`__
-      - dev
-      - 22.04
-      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-runtime/images/sha256-a94150ffb81365234ebfa34e764db5474bc6ab7d141b56495eac349778dafcf3"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-
-      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
-      - runtime
-      - 22.04
-      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
-

 Critical ROCm libraries for TensorFlow
 ===============================================================================
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -51,15 +51,12 @@ article_pages = [
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
@@ -69,11 +66,11 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/install", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
@@ -1,152 +0,0 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250513
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4
-      rocm_version: 6.3.1
-      vllm_version: 0.8.5
-      pytorch_version: 2.7.0+gitf717b2a
-      hipblaslt_version: 0.15
-  model_groups:
-    - group: Meta Llama
-      tag: llama
-      models:
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-      - model: Llama 3.1 70B
-        mad_tag: pyt_vllm_llama-3.1-70b
-        model_repo: meta-llama/Llama-3.1-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: float16
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-      - model: Llama 3.2 11B Vision
-        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
-        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
-        precision: float16
-      - model: Llama 2 7B
-        mad_tag: pyt_vllm_llama-2-7b
-        model_repo: meta-llama/Llama-2-7b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-        precision: float16
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 70B FP8
-        mad_tag: pyt_vllm_llama-3.1-70b_fp8
-        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-    - group: Mistral AI
-      tag: mistral
-      models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-      - model: Mistral 7B
-        mad_tag: pyt_vllm_mistral-7b
-        model_repo: mistralai/Mistral-7B-Instruct-v0.3
-        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
-        precision: float16
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-      - model: Mistral 7B FP8
-        mad_tag: pyt_vllm_mistral-7b_fp8
-        model_repo: amd/Mistral-7B-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
-        precision: float8
-    - group: Qwen
-      tag: qwen
-      models:
-      - model: Qwen2 7B
-        mad_tag: pyt_vllm_qwen2-7b
-        model_repo: Qwen/Qwen2-7B-Instruct
-        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
-        precision: float16
-      - model: Qwen2 72B
-        mad_tag: pyt_vllm_qwen2-72b
-        model_repo: Qwen/Qwen2-72B-Instruct
-        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
-        precision: float16
-      - model: QwQ-32B
-        mad_tag: pyt_vllm_qwq-32b
-        model_repo: Qwen/QwQ-32B
-        url: https://huggingface.co/Qwen/QwQ-32B
-        precision: float16
-        tunableop: true
-    - group: Databricks DBRX
-      tag: dbrx
-      models:
-      - model: DBRX Instruct
-        mad_tag: pyt_vllm_dbrx-instruct
-        model_repo: databricks/dbrx-instruct
-        url: https://huggingface.co/databricks/dbrx-instruct
-        precision: float16
-      - model: DBRX Instruct FP8
-        mad_tag: pyt_vllm_dbrx_fp8
-        model_repo: amd/dbrx-instruct-FP8-KV
-        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
-        precision: float8
-    - group: Google Gemma
-      tag: gemma
-      models:
-      - model: Gemma 2 27B
-        mad_tag: pyt_vllm_gemma-2-27b
-        model_repo: google/gemma-2-27b
-        url: https://huggingface.co/google/gemma-2-27b
-        precision: float16
-    - group: Cohere
-      tag: cohere
-      models:
-      - model: C4AI Command R+ 08-2024
-        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
-        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
-        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
-        precision: float16
-      - model: C4AI Command R+ 08-2024 FP8
-        mad_tag: pyt_vllm_command-r-plus_fp8
-        model_repo: amd/c4ai-command-r-plus-FP8-KV
-        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
-        precision: float8
-    - group: DeepSeek
-      tag: deepseek
-      models:
-      - model: DeepSeek MoE 16B
-        mad_tag: pyt_vllm_deepseek-moe-16b-chat
-        model_repo: deepseek-ai/deepseek-moe-16b-chat
-        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
-        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
@@ -23,11 +23,3 @@ pytorch_inference_benchmark:
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/chaidiscovery/chai-1
        precision: float16
-    - group: Mochi Video
-      tag: mochi
-      models:
-      - model: Mochi 1
-        mad_tag: pyt_mochi_video_inference
-        model_repo: genmo/mochi-1-preview
-        url: https://huggingface.co/genmo/mochi-1-preview
-        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,14 +1,14 @@
 vllm_benchmark:
  unified_docker:
    latest:
-      pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
+      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
      rocm_version: 6.3.1
-      vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
-      pytorch_version: 2.7.0+gitf717b2a
-      hipblaslt_version: 0.15
+      vllm_version: 0.8.3
+      pytorch_version: 2.7.0 (dev nightly)
+      hipblaslt_version: 0.13
  model_groups:
-    - group: Meta Llama
+    - group: Llama
      tag: llama
      models:
      - model: Llama 3.1 8B
@@ -56,7 +56,7 @@ vllm_benchmark:
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
-    - group: Mistral AI
+    - group: Mistral
      tag: mistral
      models:
      - model: Mixtral MoE 8x7B
@@ -108,7 +108,7 @@ vllm_benchmark:
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
-    - group: Databricks DBRX
+    - group: DBRX
      tag: dbrx
      models:
      - model: DBRX Instruct
@@ -121,7 +121,7 @@ vllm_benchmark:
        model_repo: amd/dbrx-instruct-FP8-KV
        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
        precision: float8
-    - group: Google Gemma
+    - group: Gemma
      tag: gemma
      models:
      - model: Gemma 2 27B
@@ -150,13 +150,6 @@ vllm_benchmark:
        model_repo: deepseek-ai/deepseek-moe-16b-chat
        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
        precision: float16
-    - group: Microsoft Phi
-      tag: phi
-      models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,29 +0,0 @@
-megatron-lm_benchmark:
-  model_groups:
-    - group: Meta Llama
-      tag: llama
-      models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-    - group: DeepSeek
-      tag: deepseek
-      models:
-      - model: DeepSeek-V3
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-    - group: Mistral AI
-      tag: mistral
-      models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -678,7 +678,7 @@ To specify the quantization scaling config, use the
 ``--quantization-param-path`` parameter. If the parameter is not specified,
 the default scaling factor of ``1`` is used, which can lead to less accurate
 results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
-Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
+Cache <https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md>`__
 in the vLLM GitHub repository.

 Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
@@ -1,348 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
-                 ROCm vLLM Docker image.
-   :keywords: model, MAD, automation, dashboarding, validate
-
-**********************************
-vLLM inference performance testing
-**********************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm vLLM
-   performance benchmark documentation. See :doc:`../vllm` for the latest version.
-
-.. _vllm-benchmark-unified-docker:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
-
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
-
-   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
-   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
-   accelerators and includes the following components:
-
-   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
-
-   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
-
-   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
-
-   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
-
-   With this Docker image, you can quickly test the :ref:`expected
-   inference performance numbers <vllm-benchmark-performance-measurements>` for
-   MI300X series accelerators.
-
-   .. _vllm-benchmark-available-models:
-
-   Supported models
-   ================
-
-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model group</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-          </div>
-        </div>
-
-        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-         {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% endif %}
-      {% endfor %}
-   {% endfor %}
-          </div>
-        </div>
-      </div>
-
-   .. _vllm-benchmark-vllm:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
-         Some models require access authorization prior to use via an external license agreement through a third party.
-
-      {% endfor %}
-   {% endfor %}
-
-   .. note::
-
-      vLLM is a toolkit and library for LLM inference and serving. AMD implements
-      high-performance custom kernels and modules in vLLM to enhance performance.
-      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-      more information.
-
-   .. _vllm-benchmark-performance-measurements:
-
-   Performance measurements
-   ========================
-
-   To evaluate performance, the
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-   page provides reference throughput and latency measurements for inferencing
-   popular AI models.
-
-   .. note::
-
-      The performance data presented in
-      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-      should not be interpreted as the peak performance achievable by AMD
-      Instinct MI325X and MI300X accelerators or ROCm software.
-
-   Advanced features and known issues
-   ==================================
-
-   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-   see the developer's guide at `<https://github.com/ROCm/vllm/blob/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker/README.md>`__.
-
-   System validation
-   =================
-
-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
-
-   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-   might hang until the periodic balancing is finalized. For more information,
-   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
-
-   .. code-block:: shell
-
-      # disable automatic NUMA balancing
-      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-      # check if NUMA balancing is disabled (returns 0 if disabled)
-      cat /proc/sys/kernel/numa_balancing
-      0
-
-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
-
-   Pull the Docker image
-   =====================
-
-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull {{ unified_docker.pull_tag }}
-
-   Benchmarking
-   ============
-
-   Once the setup is complete, choose between two options to reproduce the
-   benchmark results:
-
-   .. _vllm-benchmark-mad:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         .. tab-item:: MAD-integrated benchmarking
-
-            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-            directory and install the required packages on the host machine.
-
-            .. code-block:: shell
-
-               git clone https://github.com/ROCm/MAD
-               cd MAD
-               pip install -r requirements.txt
-
-            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-            using one GPU with the ``{{model.precision}}`` data type on the host machine.
-
-            .. code-block:: shell
-
-               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
-
-            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
-            to collect latency and throughput performance data, you can also change the benchmarking
-            parameters. See the standalone benchmarking tab for more information.
-
-            {% if model.tunableop %}
-
-            .. note::
-
-               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
-               TunableOp automatically explores different implementations and configurations of certain PyTorch
-               operators to find the fastest one for your hardware.
-
-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
-               (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
-               enable it, edit the default run behavior in the ``models.json``
-               configuration before running inference -- update the model's run
-               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
-
-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
-
-            {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            Run the vLLM benchmark tool independently by starting the
-            `Docker container <{{ unified_docker.docker_hub_url }}>`_
-            as shown in the following snippet.
-
-            .. code-block::
-
-               docker pull {{ unified_docker.pull_tag }}
-               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
-
-            In the Docker container, clone the ROCm MAD repository and navigate to the
-            benchmark scripts directory at ``~/MAD/scripts/vllm``.
-
-            .. code-block::
-
-               git clone https://github.com/ROCm/MAD
-               cd MAD/scripts/vllm
-
-            To start the benchmark, use the following command with the appropriate options.
-
-            .. code-block::
-
-               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
-
-            .. list-table::
-               :header-rows: 1
-               :align: center
-
-               * - Name
-                 - Options
-                 - Description
-
-               * - ``$test_option``
-                 - latency
-                 - Measure decoding token latency
-
-               * -
-                 - throughput
-                 - Measure token generation throughput
-
-               * -
-                 - all
-                 - Measure both throughput and latency
-
-               * - ``$num_gpu``
-                 - 1 or 8
-                 - Number of GPUs
-
-               * - ``$datatype``
-                 - ``float16`` or ``float8``
-                 - Data type
-
-            .. note::
-
-               The input sequence length, output sequence length, and tensor parallel (TP) are
-               already configured. You don't need to specify them with this script.
-
-            .. note::
-
-               If you encounter the following error, pass your access-authorized Hugging
-               Face token to the gated models.
-
-               .. code-block::
-
-                  OSError: You are trying to access a gated repo.
-
-                  # pass your HF_TOKEN
-                  export HF_TOKEN=$your_personal_hf_token
-
-            Here are some examples of running the benchmark with various options.
-
-            * Latency benchmark
-
-              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
-
-              .. code-block::
-
-                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
-
-              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
-
-            * Throughput benchmark
-
-              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
-
-              .. code-block:: shell
-
-                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
-
-              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
-
-            .. raw:: html
-
-               <style>
-               mjx-container[jax="CHTML"][display="true"] {
-                  text-align: left;
-                  margin: 0;
-               }
-               </style>
-
-            .. note::
-
-               Throughput is calculated as:
-
-               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
-
-               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
-      {% endfor %}
-   {% endfor %}
-
-Further reading
-===============
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
-
- To learn more about system settings and management practices to configure your system for
-  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
-
- To learn how to run LLM models from Hugging Face or your own model, see
-  :doc:`Running models from Hugging Face <../../hugging-face-models>`.
-
- To learn how to optimize inference on LLMs, see
-  :doc:`Inference optimization <../../../inference-optimization/index>`.
-
- To learn how to fine-tune LLMs, see
-  :doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -20,8 +20,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram

 - :doc:`LLM inference frameworks <llm-inference-frameworks>`

- :doc:`vLLM inference performance testing <vllm-benchmark>`
-
- :doc:`PyTorch inference performance testing <pytorch-inference-benchmark>`
+- :doc:`Performance testing <vllm-benchmark>`

 - :doc:`Deploying your model <deploy-your-model>`
--- a/docs/how-to/rocm-for-ai/inference/install.rst
+++ b/docs/how-to/rocm-for-ai/inference/install.rst
@@ -30,7 +30,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install

 * :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`

-* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
+* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`

 .. grid:: 1

@@ -59,8 +59,4 @@ images with the framework pre-installed.

 * :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`

-Next steps
-==========
-
-After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
-to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
+The sections that follow in :doc:`Training a model <../training/train-a-model>` are geared for a ROCm with PyTorch installation.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -24,24 +24,20 @@ PyTorch inference performance testing
   Supported models
   ================

-   The following models are supported for inference performance benchmarking
-   with PyTorch and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model group</div>
+          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
-            <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>

        <div class="row mt-1" style="display: none;">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -66,52 +62,47 @@ PyTorch inference performance testing
      {% endfor %}
   {% endfor %}

-   System validation
-   =================
+   Getting started
+   ===============

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X series accelerator with the prebuilt PyTorch Docker image.

-   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-   might hang until the periodic balancing is finalized. For more information,
-   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+   .. _pytorch-benchmark-get-started:

-   .. code-block:: shell
+   1. Disable NUMA auto-balancing.

-      # disable automatic NUMA balancing
-      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-      # check if NUMA balancing is disabled (returns 0 if disabled)
-      cat /proc/sys/kernel/numa_balancing
-      0
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+      .. code-block:: shell

-   Pull the Docker image
-   =====================
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0

   .. container:: model-doc pyt_chai1_inference

-      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
+      2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.

-      .. code-block:: shell
+         .. code-block:: shell

-         docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
+            docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue

-      .. note::
+         .. note::

-         The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
+            The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.

-   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference
+   .. container:: model-doc pyt_clip_inference

-      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
+      2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.

-      .. code-block:: shell
+         .. code-block:: shell

-         docker pull rocm/pytorch:latest
-
-   .. _pytorch-benchmark-get-started:
+            docker pull rocm/pytorch:latest

   Benchmarking
   ============
@@ -166,14 +157,11 @@ Further reading
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
-
 - To learn how to run LLM models from Hugging Face or your model, see
-  :doc:`Running models from Hugging Face <../hugging-face-models>`.
+  :doc:`Running models from Hugging Face <hugging-face-models>`.

 - To learn how to optimize inference on LLMs, see
-  :doc:`Inference optimization <../../inference-optimization/index>`.
+  :doc:`Inference optimization <../inference-optimization/index>`.

 - To learn how to fine-tune LLMs, see
-  :doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
+  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -24,7 +24,7 @@ vLLM inference performance testing

   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_

-   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
+   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_

   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

@@ -37,15 +37,11 @@ vLLM inference performance testing
   Supported models
   ================

-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model group</div>
+          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
@@ -54,7 +50,7 @@ vLLM inference performance testing
        </div>

        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -115,37 +111,35 @@ vLLM inference performance testing
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

-   System validation
-   =================
+   Getting started
+   ===============

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X accelerator with the prebuilt vLLM Docker image.

-   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-   might hang until the periodic balancing is finalized. For more information,
-   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+   .. _vllm-benchmark-get-started:

-   .. code-block:: shell
+   1. Disable NUMA auto-balancing.

-      # disable automatic NUMA balancing
-      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-      # check if NUMA balancing is disabled (returns 0 if disabled)
-      cat /proc/sys/kernel/numa_balancing
-      0
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+      .. code-block:: shell

-   Pull the Docker image
-   =====================
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0

-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
+   2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.

-   .. code-block:: shell
+      Use the following command to pull the Docker image from Docker Hub.

-      docker pull {{ unified_docker.pull_tag }}
+      .. code-block:: shell
+
+         docker pull {{ unified_docker.pull_tag }}

   Benchmarking
   ============
@@ -282,7 +276,7 @@ vLLM inference performance testing

            * Latency benchmark

-              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.

              .. code-block::

@@ -292,11 +286,11 @@ vLLM inference performance testing

            * Throughput benchmark

-              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.

              .. code-block:: shell

-                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
+                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}

              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.

@@ -322,23 +316,23 @@ vLLM inference performance testing
 Further reading
 ===============

+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`../inference-optimization/workload`.
+
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
-
 - To learn how to run LLM models from Hugging Face or your own model, see
-  :doc:`Running models from Hugging Face <../hugging-face-models>`.
+  :doc:`Running models from Hugging Face <hugging-face-models>`.

 - To learn how to optimize inference on LLMs, see
-  :doc:`Inference optimization <../../inference-optimization/index>`.
+  :doc:`Inference optimization <../inference-optimization/index>`.

 - To learn how to fine-tune LLMs, see
-  :doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
+  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.

 Previous versions
 =================
@@ -356,20 +350,6 @@ for benchmarking, see the version-specific documentation.
     - PyTorch version
     - Resources

-   * - 6.3.1
-     - 0.8.5
-     - 2.7.0
-     - 
-       * :doc:`Documentation <previous-versions/vllm-0.8.5-20250513>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_
-
-   * - 6.3.1
-     - 0.8.3
-     - 2.7.0
-     - 
-       * `Documentation <https://rocm.docs.amd.com/en/docs-6.4.0/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_
-
   * - 6.3.1
     - 0.7.3
     - 2.7.0
--- a/docs/how-to/rocm-for-ai/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-health-check.rst
@@ -1,104 +0,0 @@
-.. meta::
-   :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
-   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference
-
-.. _rocm-for-ai-system-health-bench:
-
-************************
-System health benchmarks
-************************
-
-Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
-
-ROCm Validation Suite (RVS) tests
-=================================
-
-RVS provides a collection of tests, benchmarks, and qualification tools, each
-targeting a specific subsystem of the system under test. It includes tests for
-GPU stress and memory bandwidth.
-
-.. _healthcheck-install-rvs:
-
-Install ROCm Validation Suite
-----------------------------
-
-To get started, install RVS. For example, on an Ubuntu system with ROCm already
-installed, run the following command:
-
-.. code-block:: shell
-
-   sudo apt update
-   sudo apt install rocm-validation-suite
-
-See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
-and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
-in the Instinct documentation for more detailed instructions.
-
-Benchmark, stress, and qualification tests
------------------------------------------
-
-The GPU stress test runs various GEMM computations as workloads to stress the GPU FLOPS performance and check whether it
-meets the configured target GFLOPS.
-
-Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
-section of the Instinct documentation for usage instructions.
-
-BabelStream test
----------------
-
-BabelStream is a synthetic GPU benchmark based on the STREAM benchmark for
-CPUs, measuring memory transfer rates to and from global device memory.
-BabelStream tests are included with the RVS package as part of the `BABEL module
-<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.
-
-For more information, see `Performance benchmarking
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
-in the Instinct documentation.
-
-RCCL tests
-==========
-
-The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
-communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
-the performance and verifies the correctness of these collective operations.
-This helps ensure optimal scaling for multi-accelerator tasks.
-
-1. To get started, build RCCL-tests using the official instructions in the README at
-   `<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
-   following commands:
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/rccl-tests.git
-      cd rccl-tests
-      make
-
-2. Run the suggested RCCL tests -- see `RCCL benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
-   in the Instinct performance benchmarking documentation for instructions.
-
-TransferBench test
-==================
-
-TransferBench is a standalone utility for benchmarking simultaneous data
-transfer performance between various devices in the system, including
-CPU-to-GPU and GPU-to-GPU (peer-to-peer). This helps identify potential
-bottlenecks in data movement between the host system and the GPUs, or between
-GPUs, which can impact end-to-end latency.
-
-.. _healthcheck-install-transferbench:
-
-1. To get started, use the instructions in the `TransferBench documentation
-   <https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
-   or use the following commands:
-
-   .. code:: shell
-
-      git clone https://github.com/ROCm/TransferBench.git
-      cd TransferBench
-      CC=hipcc make
-
-2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
-   in the Instinct performance benchmarking documentation for instructions.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -79,18 +79,11 @@ across different input sequences. Support for packed input format is planned for
 System validation
 =================

-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.

-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
 Environment setup
 =================

@@ -182,8 +175,8 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

 .. _amd-maxtext-download-docker:

-Pull the Docker image
---------------------
+Download the Docker image
+-------------------------

 1. Use the following command to pull the Docker image from Docker Hub.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
@@ -1,175 +0,0 @@
-.. meta::
-   :description: How to train a model using LLM Foundry for ROCm.
-   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
-
-******************************************
-Training MPT-30B with LLM Foundry and ROCm
-******************************************
-
-MPT-30B is a 30-billion parameter decoder-style transformer-based model from
-the Mosaic Pretrained Transformer (MPT) family -- learn more about it in
-MosaicML's research blog `MPT-30B: Raising the bar for open-source foundation
-models <https://www.databricks.com/blog/mpt-30b>`_.
-
-ROCm and `<https://github.com/ROCm/MAD>`__ provide a pre-configured training
-environment for the MPT-30B model using the ``rocm/pytorch-training:v25.5``
-base `Docker image <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
-and the `LLM Foundry <https://github.com/mosaicml/llm-foundry>`_ framework.
-This environment packages the following software components to train
-on AMD Instinct MI300X series accelerators:
-
-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| PyTorch                  | 2.7.0a0+git6374332             |
-+--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0.post1                    |
-+--------------------------+--------------------------------+
-
-Using this image, you can build, run, and test the training process
-for MPT-30B with access to detailed logs and performance metrics.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-Getting started
-===============
-
-The following procedures help you set up the training environment in a
-reproducible Docker container. This training environment is tailored for
-training MPT-30B using LLM Foundry and the specific model configurations outlined.
-Other configurations and run conditions outside those described in this
-document are not validated.
-
-.. tab-set::
-
-   .. tab-item:: MAD-integrated benchmarking
-
-      On your host machine, clone the ROCm Model Automation and Dashboarding
-      (`<https://github.com/ROCm/MAD>`__) repository to a local directory and
-      install the required packages.
-
-      .. code-block:: shell
-
-         git clone https://github.com/ROCm/MAD
-         cd MAD
-         pip install -r requirements.txt
-
-      Use this command to initiate the MPT-30B training benchmark.
-
-      .. code-block:: shell
-
-         python3 tools/run_models.py --tags pyt_mpt30b_training --keep-model-dir --live-output --clean-docker-cache
-
-      .. tip::
-
-         If you experience data download failures, set the
-         ``MAD_SECRETS_HFTOKEN`` variable to your Hugging Face access token. See
-         `User access tokens <https://huggingface.co/docs/hub/security-tokens>`_
-         for details.
-
-         .. code-block:: shell
-
-            export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-
-      .. note::
-
-         For improved performance (training throughput), consider enabling TunableOp.
-         By default, ``pyt_mpt30b_training`` runs with TunableOp disabled. To enable it,
-         run ``tools/run_models.py`` with the ``--tunableop on`` argument or edit the
-         ``models.json`` configuration before running training.
-
-         Although this might increase the initial training time, it can result in a performance gain.
-
-   .. tab-item:: Standalone benchmarking
-
-      To set up the training environment, clone the
-      `<https://github.com/ROCm/MAD>`__ repo and build the Docker image. In
-      this snippet, the image is named ``mosaic_mpt30_image``.
-
-      .. code-block:: shell
-
-         git clone https://github.com/ROCm/MAD
-         cd MAD
-
-         docker build --build-arg MAD_SYSTEM_GPU_ARCHITECTURE=gfx942 -f docker/pyt_mpt30b_training.ubuntu.amd.Dockerfile -t mosaic_mpt30_image .
-
-      Start a ``mosaic_mpt30_image`` container using the following command.
-
-      .. code-block:: shell
-
-         docker run -it --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --shm-size=8G mosaic_mpt30_image
-
-      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
-      repository and navigate to the benchmark scripts directory at
-      ``/workspace/MAD/scripts/pyt_mpt30b_training``.
-
-      .. code-block:: shell
-
-         git clone https://github.com/ROCm/MAD
-         cd MAD/scripts/pyt_mpt30b_training
-
-      To initiate the training process, use the following command. This script uses the hyperparameters defined in
-      ``mpt-30b-instruct.yaml``.
-
-      .. code-block:: shell
-
-         source run.sh
-
-      .. note::
-
-         For improved performance (training throughput), consider enabling TunableOp.
-         To enable it, add the ``--tunableop on`` flag.
-
-         .. code-block:: shell
-
-            source run.sh --tunableop on
-
-         Although this might increase the initial training time, it can result in a performance gain.
-
-Interpreting the output
-=======================
-
-The training output will be displayed in the terminal and simultaneously saved
-to the ``output.txt`` file in the current directory. Key performance metrics will
-also be extracted and appended to the ``perf_pyt_mpt30b_training.csv`` file.
-
-Key performance metrics include:
-
- Training logs: Real-time display of loss metrics, accuracy, and training progress.
-
- Model checkpoints: Periodically saved model snapshots for potential resume or evaluation.
-
- Performance metrics: Detailed summaries of training speed and training loss metrics.
-
-  - Performance (throughput/samples_per_sec)
-
-    Overall throughput, measuring the total samples processed per second. Higher values indicate better hardware utilization.
-
-  - Performance per device (throughput/samples_per_sec)
-
-    Throughput on a per-device basis, showing how each GPU or CPU is performing.
-
-  - Language Cross Entropy (metrics/train/LanguageCrossEntropy)
-
-    Measures prediction accuracy. Lower cross entropy suggests the model’s output is closer to the expected distribution.
-
-  - Training loss (loss/train/total)
-
-    Overall training loss. A decreasing trend indicates the model is learning effectively.
-
-
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -77,18 +77,11 @@ popular AI models.
 System validation
 =================

-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.

-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
 This Docker image is optimized for specific model configurations outlined
 below. Performance can vary for other training workloads, as AMD 
 doesn’t validate configurations and run conditions outside those described.
--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,12 +21,8 @@ In this guide, you'll learn about:

 - Training a model

-  - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
+  - :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`

-  - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
-
-  - :doc:`With JAX MaxText <benchmark-docker/jax-maxtext>`
-
-  - :doc:`With LLM Foundry <benchmark-docker/mpt-llm-foundry>`
+  - :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`

 - :doc:`Scaling model training <scale-model-training>`
--- a/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
+++ b/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
@@ -5,13 +5,12 @@
   :keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax

 .. _train-a-model-system-validation:
-.. _rocm-for-ai-system-optimization:

-**********************************************************
-Prerequisite system validation before running AI workloads
-**********************************************************
+**********************************************
+Prerequisite system validation before training
+**********************************************

-Complete the following system validation and optimization steps to set up your system before starting training and inference.
+Complete the following system validation and optimization steps to set up your system before starting training.

 Disable NUMA auto-balancing
 ---------------------------
@@ -27,8 +26,7 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

-See `Disable NUMA auto-balancing <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_
-in the Instinct documentation for more information.
+See :ref:`mi300x-disable-numa` for more information.

 Hardware verification with ROCm
 -------------------------------
@@ -44,8 +42,7 @@ Run the command:

   rocm-smi --setperfdeterminism 1900

-See `Hardware verfication for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
-in the Instinct documentation for more information.
+See :ref:`mi300x-hardware-verification-with-rocm` for more information.

 RCCL Bandwidth Test for multi-node setups
 -----------------------------------------
--- a/docs/reference/api-libraries.md
+++ b/docs/reference/api-libraries.md
@@ -45,7 +45,7 @@
 (communication-libraries)=

 * {doc}`RCCL <rccl:index>`
-* {doc}`rocSHMEM <rocshmem:index>`
+* [rocSHMEM](https://github.com/ROCm/rocSHMEM)
 :::

 :::{grid-item-card} Math
--- a/docs/reference/gpu-arch-specs.rst
+++ b/docs/reference/gpu-arch-specs.rst
@@ -281,31 +281,13 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - SGPR File (KiB)
          - GFXIP Major version
          - GFXIP Minor version
-        *
-          - Radeon AI PRO R9700
-          - RDNA4
-          - gfx1201
-          - 16
-          - 64
-          - 32 or 64
-          - 128
-          - 64
-          - 8
-          - N/A
-          - 32
-          - 16
-          - 32
-          - 768
-          - 32
-          - 12
-          - 0
        *
          - Radeon PRO V710
          - RDNA3
          - gfx1101
          - 28
          - 54
-          - 32 or 64
+          - 32
          - 128
          - 56
          - 4
@@ -314,7 +296,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -323,7 +305,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 48
          - 96
-          - 32 or 64
+          - 32
          - 128
          - 96
          - 6
@@ -332,7 +314,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -341,7 +323,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 48
          - 96
-          - 32 or 64
+          - 32
          - 128
          - 96
          - 6
@@ -350,7 +332,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -359,7 +341,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 48
          - 70
-          - 32 or 64
+          - 32
          - 128
          - 96
          - 6
@@ -368,7 +350,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -377,7 +359,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 32
          - 70
-          - 32 or 64
+          - 32
          - 128
          - 64
          - 6
@@ -386,7 +368,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -395,7 +377,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1101
          - 16
          - 48
-          - 32 or 64
+          - 32
          - 128
          - 64
          - 4
@@ -404,7 +386,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -413,7 +395,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 32
          - 60
-          - 32 or 64
+          - 32
          - 128
          - 128
          - 4
@@ -422,7 +404,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -431,7 +413,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 28
-          - 32 or 64
+          - 32
          - 128
          - 32
          - 2
@@ -440,7 +422,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -449,7 +431,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 32
          - 72
-          - 32 or 64
+          - 32
          - 128
          - 128
          - 4
@@ -458,7 +440,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -467,7 +449,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1012
          - 8
          - 22
-          - 32 or 64
+          - 32
          - 128
          -
          - 4
@@ -522,85 +504,13 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - SGPR File (KiB)
          - GFXIP Major version
          - GFXIP Minor version
-        *
-          - Radeon RX 9070 XT
-          - RDNA4
-          - gfx1201
-          - 16
-          - 64
-          - 32 or 64
-          - 128
-          - 64
-          - 8
-          - N/A
-          - 32
-          - 16
-          - 32
-          - 768
-          - 32
-          - 12
-          - 0
-        *
-          - Radeon RX 9070 GRE
-          - RDNA4
-          - gfx1201
-          - 16
-          - 48
-          - 32 or 64
-          - 128
-          - 48
-          - 6
-          - N/A
-          - 32
-          - 16
-          - 32
-          - 768
-          - 32
-          - 12
-          - 0
-        *
-          - Radeon RX 9070
-          - RDNA4
-          - gfx1201
-          - 16
-          - 56
-          - 32 or 64
-          - 128
-          - 64
-          - 8
-          - N/A
-          - 32
-          - 16
-          - 32
-          - 768
-          - 32
-          - 12
-          - 0
-        *
-          - Radeon RX 9060 XT
-          - RDNA4
-          - gfx1200
-          - 16
-          - 32
-          - 32 or 64
-          - 128
-          - 32
-          - 4
-          - N/A
-          - 32
-          - 16
-          - 32
-          - 768
-          - 32
-          - 12
-          - 0
        *
          - Radeon RX 7900 XTX
          - RDNA3
          - gfx1100
          - 24
          - 96
-          - 32 or 64
+          - 32
          - 128
          - 96
          - 6
@@ -609,7 +519,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -618,7 +528,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 20
          - 84
-          - 32 or 64
+          - 32
          - 128
          - 80
          - 6
@@ -627,7 +537,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -636,7 +546,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1100
          - 16
          - 80
-          - 32 or 64
+          - 32
          - 128
          - 64
          - 6
@@ -645,7 +555,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -654,7 +564,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1101
          - 16
          - 60
-          - 32 or 64
+          - 32
          - 128
          - 64
          - 4
@@ -663,7 +573,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -672,7 +582,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1101
          - 12
          - 54
-          - 32 or 64
+          - 32
          - 128
          - 48
          - 4
@@ -681,7 +591,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 768
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -690,7 +600,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1102
          - 8
          - 32
-          - 32 or 64
+          - 32
          - 128
          - 32
          - 2
@@ -699,7 +609,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 11
          - 0
        *
@@ -708,7 +618,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 80
-          - 32 or 64
+          - 32
          - 128
          - 128
          - 4
@@ -717,7 +627,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -726,7 +636,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 80
-          - 32 or 64
+          - 32
          - 128
          - 128
          - 4
@@ -735,7 +645,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -744,7 +654,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 72
-          - 32 or 64
+          - 32
          - 128
          - 128
          - 4
@@ -753,7 +663,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -762,7 +672,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1030
          - 16
          - 60
-          - 32 or 64
+          - 32
          - 128
          - 128
          - 4
@@ -771,7 +681,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -780,7 +690,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1031
          - 12
          - 40
-          - 32 or 64
+          - 32
          - 128
          - 96
          - 3
@@ -789,7 +699,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -798,7 +708,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1031
          - 12
          - 40
-          - 32 or 64
+          - 32
          - 128
          - 96
          - 3
@@ -807,7 +717,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -816,7 +726,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1031
          - 10
          - 36
-          - 32 or 64
+          - 32
          - 128
          - 80
          - 3
@@ -825,7 +735,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -834,7 +744,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 32
-          - 32 or 64
+          - 32
          - 128
          - 32
          - 2
@@ -843,7 +753,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -852,7 +762,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 32
-          - 32 or 64
+          - 32
          - 128
          - 32
          - 2
@@ -861,7 +771,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
@@ -870,7 +780,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - gfx1032
          - 8
          - 28
-          - 32 or 64
+          - 32
          - 128
          - 32
          - 2
@@ -879,7 +789,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - 16
          - 32
          - 512
-          - 32
+          - 16
          - 10
          - 3
        *
--- a/docs/release/versions.md
+++ b/docs/release/versions.md
@@ -10,7 +10,6 @@

 | Version | Release date |
 | ------- | ------------ |
-| [6.4.1](https://rocm.docs.amd.com/en/docs-6.4.1/) | May 21, 2025 |
 | [6.4.0](https://rocm.docs.amd.com/en/docs-6.4.0/) | April 11, 2025 |
 | [6.3.3](https://rocm.docs.amd.com/en/docs-6.3.3/) | February 19, 2025 |
 | [6.3.2](https://rocm.docs.amd.com/en/docs-6.3.2/) | January 28, 2025 |
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -12,14 +12,14 @@ subtrees:
  - file: compatibility/compatibility-matrix.rst
    title: Compatibility matrix
    entries:
-    - url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html
+    - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/reference/system-requirements.html
      title: Linux system requirements
    - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
      title: Windows system requirements

 - caption: Install
  entries:
-  - url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/
+  - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/
    title: ROCm on Linux
  - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
    title: HIP SDK on Windows
@@ -36,22 +36,16 @@ subtrees:
    title: Use ROCm for AI
    subtrees:
    - entries:
-      - file: how-to/rocm-for-ai/install.rst
-        title: Installation
-      - file: how-to/rocm-for-ai/system-health-check.rst
-        title: System health benchmarks
      - file: how-to/rocm-for-ai/training/index.rst
        title: Training
        subtrees:
        - entries:
-          - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+          - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm
            title: Train a model with Megatron-LM
-          - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+          - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
            title: Train a model with PyTorch
-          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext
            title: Train a model with JAX MaxText
-          - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
-            title: Train a model with LLM Foundry
          - file: how-to/rocm-for-ai/training/scale-model-training.rst
            title: Scale model training

@@ -74,13 +68,15 @@ subtrees:
        title: Inference
        subtrees:
        - entries:
+          - file: how-to/rocm-for-ai/inference/install.rst
+            title: Installation
          - file: how-to/rocm-for-ai/inference/hugging-face-models.rst
            title: Run models from Hugging Face
          - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
            title: LLM inference frameworks
-          - file: how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+          - file: how-to/rocm-for-ai/inference/vllm-benchmark.rst
            title: vLLM inference performance testing
-          - file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+          - file: how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
            title: PyTorch inference performance testing
          - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
            title: Deploy your model
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,4 +1,4 @@
-rocm-docs-core==1.20.1
+rocm-docs-core==1.18.2
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile requirements.in
+#    pip-compile docs/sphinx/requirements.in
 #
 accessible-pygments==0.0.5
    # via pydata-sphinx-theme
@@ -10,73 +10,74 @@ alabaster==1.0.0
    # via sphinx
 asttokens==3.0.0
    # via stack-data
-attrs==25.3.0
+attrs==25.1.0
    # via
    #   jsonschema
    #   jupyter-cache
    #   referencing
-babel==2.17.0
+babel==2.16.0
    # via
    #   pydata-sphinx-theme
    #   sphinx
-beautifulsoup4==4.13.4
+beautifulsoup4==4.12.3
    # via pydata-sphinx-theme
-breathe==4.36.0
+breathe==4.35.0
    # via rocm-docs-core
-certifi==2025.4.26
+certifi==2024.8.30
    # via requests
 cffi==1.17.1
    # via
    #   cryptography
    #   pynacl
-charset-normalizer==3.4.2
+charset-normalizer==3.4.0
    # via requests
-click==8.2.1
+click==8.1.7
    # via
    #   jupyter-cache
    #   sphinx-external-toc
 comm==0.2.2
    # via ipykernel
-cryptography==45.0.3
+cryptography==44.0.1
    # via pyjwt
-debugpy==1.8.14
+debugpy==1.8.12
    # via ipykernel
-decorator==5.2.1
+decorator==5.1.1
    # via ipython
 defusedxml==0.7.1
    # via sphinxcontrib-datatemplates
-deprecated==1.2.18
+deprecated==1.2.15
    # via pygithub
 docutils==0.21.2
    # via
+    #   breathe
    #   myst-parser
    #   pydata-sphinx-theme
    #   sphinx
-exceptiongroup==1.3.0
+exceptiongroup==1.2.2
    # via ipython
 executing==2.2.0
    # via stack-data
-fastjsonschema==2.21.1
+fastjsonschema==2.20.0
    # via
    #   nbformat
    #   rocm-docs-core
-gitdb==4.0.12
+gitdb==4.0.11
    # via gitpython
-gitpython==3.1.44
+gitpython==3.1.43
    # via rocm-docs-core
-greenlet==3.2.3
+greenlet==3.1.1
    # via sqlalchemy
 idna==3.10
    # via requests
 imagesize==1.4.1
    # via sphinx
-importlib-metadata==8.7.0
+importlib-metadata==8.6.1
    # via
    #   jupyter-cache
    #   myst-nb
 ipykernel==6.29.5
    # via myst-nb
-ipython==8.37.0
+ipython==8.31.0
    # via
    #   ipykernel
    #   myst-nb
@@ -86,9 +87,9 @@ jinja2==3.1.6
    # via
    #   myst-parser
    #   sphinx
-jsonschema==4.24.0
+jsonschema==4.23.0
    # via nbformat
-jsonschema-specifications==2025.4.1
+jsonschema-specifications==2024.10.1
    # via jsonschema
 jupyter-cache==1.0.1
    # via myst-nb
@@ -96,7 +97,7 @@ jupyter-client==8.6.3
    # via
    #   ipykernel
    #   nbclient
-jupyter-core==5.8.1
+jupyter-core==5.7.2
    # via
    #   ipykernel
    #   jupyter-client
@@ -116,9 +117,9 @@ mdit-py-plugins==0.4.2
    # via myst-parser
 mdurl==0.1.2
    # via markdown-it-py
-myst-nb==1.2.0
+myst-nb==1.1.2
    # via rocm-docs-core
-myst-parser==4.0.1
+myst-parser==4.0.0
    # via myst-nb
 nbclient==0.10.2
    # via
@@ -131,20 +132,19 @@ nbformat==5.10.4
    #   nbclient
 nest-asyncio==1.6.0
    # via ipykernel
-packaging==25.0
+packaging==24.2
    # via
    #   ipykernel
-    #   pydata-sphinx-theme
    #   sphinx
 parso==0.8.4
    # via jedi
 pexpect==4.9.0
    # via ipython
-platformdirs==4.3.8
+platformdirs==4.3.6
    # via jupyter-core
-prompt-toolkit==3.0.51
+prompt-toolkit==3.0.50
    # via ipython
-psutil==7.0.0
+psutil==6.1.1
    # via ipykernel
 ptyprocess==0.7.0
    # via pexpect
@@ -152,19 +152,19 @@ pure-eval==0.2.3
    # via stack-data
 pycparser==2.22
    # via cffi
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.0
    # via
    #   rocm-docs-core
    #   sphinx-book-theme
-pygithub==2.6.1
+pygithub==2.5.0
    # via rocm-docs-core
-pygments==2.19.1
+pygments==2.18.0
    # via
    #   accessible-pygments
    #   ipython
    #   pydata-sphinx-theme
    #   sphinx
-pyjwt[crypto]==2.10.1
+pyjwt[crypto]==2.10.0
    # via pygithub
 pynacl==1.5.0
    # via pygithub
@@ -178,7 +178,7 @@ pyyaml==6.0.2
    #   rocm-docs-core
    #   sphinx-external-toc
    #   sphinxcontrib-datatemplates
-pyzmq==26.4.0
+pyzmq==26.2.0
    # via
    #   ipykernel
    #   jupyter-client
@@ -186,23 +186,23 @@ referencing==0.36.2
    # via
    #   jsonschema
    #   jsonschema-specifications
-requests==2.32.4
+requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.20.1
+rocm-docs-core==1.18.2
    # via -r requirements.in
-rpds-py==0.25.1
+rpds-py==0.22.3
    # via
    #   jsonschema
    #   referencing
 six==1.17.0
    # via python-dateutil
-smmap==5.0.2
+smmap==5.0.1
    # via gitdb
-snowballstemmer==3.0.1
+snowballstemmer==2.2.0
    # via sphinx
-soupsieve==2.7
+soupsieve==2.6
    # via beautifulsoup4
 sphinx==8.1.3
    # via
@@ -220,7 +220,7 @@ sphinx==8.1.3
    #   sphinx-sitemap
    #   sphinxcontrib-datatemplates
    #   sphinxcontrib-runcmd
-sphinx-book-theme==1.1.4
+sphinx-book-theme==1.1.3
    # via rocm-docs-core
 sphinx-copybutton==0.5.2
    # via rocm-docs-core
@@ -228,7 +228,7 @@ sphinx-design==0.6.1
    # via rocm-docs-core
 sphinx-external-toc==1.0.1
    # via rocm-docs-core
-sphinx-notfound-page==1.1.0
+sphinx-notfound-page==1.0.4
    # via rocm-docs-core
 sphinx-reredirects==0.1.6
    # via -r requirements.in
@@ -250,13 +250,13 @@ sphinxcontrib-runcmd==0.2.0
    # via sphinxcontrib-datatemplates
 sphinxcontrib-serializinghtml==2.0.0
    # via sphinx
-sqlalchemy==2.0.41
+sqlalchemy==2.0.37
    # via jupyter-cache
 stack-data==0.6.3
    # via ipython
 tabulate==0.9.0
    # via jupyter-cache
-tomli==2.2.1
+tomli==2.1.0
    # via sphinx
 tornado==6.4.2
    # via
@@ -272,23 +272,21 @@ traitlets==5.14.3
    #   matplotlib-inline
    #   nbclient
    #   nbformat
-typing-extensions==4.14.0
+typing-extensions==4.12.2
    # via
-    #   beautifulsoup4
-    #   exceptiongroup
    #   ipython
    #   myst-nb
    #   pydata-sphinx-theme
    #   pygithub
    #   referencing
    #   sqlalchemy
-urllib3==2.4.0
+urllib3==2.2.3
    # via
    #   pygithub
    #   requests
 wcwidth==0.2.13
    # via prompt-toolkit
-wrapt==1.17.2
+wrapt==1.17.0
    # via deprecated
-zipp==3.23.0
+zipp==3.21.0
    # via importlib-metadata
--- a/docs/what-is-rocm.rst
+++ b/docs/what-is-rocm.rst
@@ -52,7 +52,7 @@ Communication
  :header: "Component", "Description"

  ":doc:`RCCL <rccl:index>`", "Standalone library that provides multi-GPU and multi-node collective communication primitives"
-  ":doc:`rocSHMEM <rocshmem:index>`", "An intra-kernel networking library that provides GPU-centric networking through an OpenSHMEM-like interface"
+  "`rocSHMEM <https://github.com/ROCm/rocSHMEM>`_", "Runtime that provides GPU-centric networking through an OpenSHMEM-like interface. This intra-kernel networking library simplifies application code complexity and enables more fine-grained communication/computation overlap than traditional host-driven networking."

 Math
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -117,11 +117,6 @@ Performance
  ":doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`", "Toolkit for developing analysis tools for profiling and tracing GPU compute applications. This toolkit is in beta and subject to change"
  ":doc:`ROCTracer <roctracer:index>`", "Intercepts runtime API calls and traces asynchronous activity"

-.. note::
-
-  `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
-  Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
-
 Development
 ^^^^^^^^^^^

--- a/tools/autotag/components.xml
+++ b/tools/autotag/components.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.1"
+    <default revision="refs/tags/rocm-6.4.0"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/tools/rocm-build/ROCm.mk
+++ b/tools/rocm-build/ROCm.mk
@@ -87,6 +87,7 @@ endef

 $(call adddep,amd_smi_lib,${ASAN_DEP})
 $(call adddep,aqlprofile,${ASAN_DEP} rocr)
+$(call adddep,aqlprofiletest,lightning rocminfo aqlprofile opencl_on_rocclr hip_on_rocclr)
 $(call adddep,comgr,lightning devicelibs)
 $(call adddep,dbgapi,rocr comgr)
 $(call adddep,devicelibs,lightning)
@@ -114,7 +115,7 @@ $(call adddep,roctracer,${ASAN_DEP} rocr hip_on_rocclr)


 # rocm-dev points to all possible last finish components of Stage1 build.
-rocm-dev-components :=amd_smi_lib aqlprofile comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
+rocm-dev-components :=amd_smi_lib aqlprofile aqlprofiletest comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
 	lightning rocprofiler-compute opencl_on_rocclr openmp_extras rocm_bandwidth_test rocm_smi_lib \
 	rocm-cmake rocm-core rocm-gdb rocminfo rocprofiler-register rocprofiler-sdk rocprofiler-systems \
 	rocprofiler rocr rocr_debug_agent rocrsamples roctracer
--- a/tools/rocm-build/build_rocr.sh
+++ b/tools/rocm-build/build_rocr.sh
@@ -255,8 +255,8 @@ print_output_directory() {
 # Common variables
 target="build"

-kfdtest_target="no"
-rocrtst_target="no"
+kfdtest_target="yes"
+rocrtst_target="yes"
 rocr_target="ON"

 package_root="$(getPackageRoot)"
--- a/tools/rocm-build/docker/ubuntu22/install-prerequisities.sh
+++ b/tools/rocm-build/docker/ubuntu22/install-prerequisities.sh
--- a/tools/rocm-build/docker/ubuntu22/packages
+++ b/tools/rocm-build/docker/ubuntu22/packages
@@ -60,6 +60,7 @@ libfile-find-rule-perl
 libgflags-dev
 libglew-dev
 libgmp-dev
+libgoogle-glog-dev
 libgtk2.0-dev
 libhdf5-serial-dev
 libjpeg-dev
@@ -89,6 +90,7 @@ libsuitesparse-dev
 libsystemd-dev
 libtinfo-dev
 libtool
+libunwind-dev
 liburi-encode-perl
 libva-dev
 libvirt-clients
@@ -96,6 +98,7 @@ libvirt-daemon-system
 libyaml-cpp-dev
 libzstd-dev
 llvm
+llvm-6.0-dev
 llvm-dev
 llvm-runtime
 mesa-common-dev
@@ -109,7 +112,8 @@ pigz
 pkg-config
 protobuf-compiler
 python-is-python3
-python3-pip-whl
+python-pip-whl
+python-yaml
 python3-dev
 python3-pip
 python3-venv
--- a/tools/rocm-build/docker/ubuntu24/install-prerequisites.sh
+++ b/tools/rocm-build/docker/ubuntu24/install-prerequisites.sh
@@ -17,7 +17,7 @@ git --version

 # venv for python to be able to run pip3 without --break-system-packages
 python3 -m venv /opt/venv
-source /opt/venv/bin/activate
+
 pip3 install CppHeaderParser argparse lxml recommonmark jinja2==3.0.0 \
    websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas \
    myst-parser setuptools lit
--- a/tools/rocm-build/envsetup.sh
+++ b/tools/rocm-build/envsetup.sh
@@ -217,7 +217,7 @@ export RCCL_ROOT=$WORK_ROOT/rccl
 export ROCM_DBGAPI_ROOT=$WORK_ROOT/ROCdbgapi
 export ROCM_GDB_ROOT=$WORK_ROOT/ROCgdb
 # export ROCclr_ROOT=$WORK_ROOT/vdi
-export HIP_ON_ROCclr_ROOT=$WORK_ROOT/hip
+export HIP_ON_ROCclr_ROOT=$WORK_ROOT/HIP
 export HIPAMD_ROOT=$WORK_ROOT/hipamd
 export HIP_CATCH_TESTS_ROOT=$WORK_ROOT/hip-tests
 # export OPENCL_ON_ROCclr_ROOT=$WORK_ROOT/opencl-on-vdi
--- a/tools/rocm-build/rocm-6.4.1.xml
+++ b/tools/rocm-build/rocm-6.4.1.xml
@@ -1,79 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<manifest>
-    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.1"
-     remote="rocm-org"
-     sync-c="true"
-     sync-j="4" />
-<!--list of projects for ROCm-->
-    <project name="ROCm" revision="roc-6.4.x" />
-    <project name="ROCK-Kernel-Driver" />
-    <project name="ROCR-Runtime" />
-    <project name="amdsmi" />
-    <project name="rdc" />
-    <project name="rocm_bandwidth_test" />
-    <project name="rocm_smi_lib" />
-    <project name="rocm-core" />
-    <project name="rocm-examples" />
-    <project name="rocminfo" />
-    <project name="rocprofiler" />
-    <project name="rocprofiler-register" />
-    <project name="rocprofiler-sdk" />
-    <project name="rocprofiler-compute" />
-    <project name="rocprofiler-systems" />
-    <project name="roctracer" />
-<!--HIP Projects-->
-    <project name="HIP" />
-    <project name="hip-tests" />
-    <project name="HIPIFY" />
-    <project name="clr" />
-    <project name="hipother" />
-<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
-    <project name="half" />
-    <project name="llvm-project" />
-    <project name="spirv-llvm-translator" />
-<!-- gdb projects -->
-    <project name="ROCdbgapi" />
-    <project name="ROCgdb" />
-    <project name="rocr_debug_agent" />
-<!-- ROCm Libraries -->
-    <project groups="mathlibs" name="AMDMIGraphX" />
-    <project groups="mathlibs" name="MIOpen" />
-    <project groups="mathlibs" name="MIVisionX" />
-    <project groups="mathlibs" name="ROCmValidationSuite" />
-    <project groups="mathlibs" name="Tensile" />
-    <project groups="mathlibs" name="composable_kernel" />
-    <project groups="mathlibs" name="hipBLAS-common" />
-    <project groups="mathlibs" name="hipBLAS" />
-    <project groups="mathlibs" name="hipBLASLt" />
-    <project groups="mathlibs" name="hipCUB" />
-    <project groups="mathlibs" name="hipFFT" />
-    <project groups="mathlibs" name="hipRAND" />
-    <project groups="mathlibs" name="hipSOLVER" />
-    <project groups="mathlibs" name="hipSPARSE" />
-    <project groups="mathlibs" name="hipSPARSELt" />
-    <project groups="mathlibs" name="hipTensor" />
-    <project groups="mathlibs" name="hipfort" />
-    <project groups="mathlibs" name="rccl" />
-    <project groups="mathlibs" name="rocAL" />
-    <project groups="mathlibs" name="rocALUTION" />
-    <project groups="mathlibs" name="rocBLAS" />
-    <project groups="mathlibs" name="rocDecode" />
-    <project groups="mathlibs" name="rocJPEG" />
-    <project groups="mathlibs" name="rocPyDecode" />
-    <project groups="mathlibs" name="rocFFT" />
-    <project groups="mathlibs" name="rocPRIM" />
-    <project groups="mathlibs" name="rocRAND" />
-    <project groups="mathlibs" name="rocSHMEM" />
-    <project groups="mathlibs" name="rocSOLVER" />
-    <project groups="mathlibs" name="rocSPARSE" />
-    <project groups="mathlibs" name="rocThrust" />
-    <project groups="mathlibs" name="rocWMMA" />
-    <project groups="mathlibs" name="rocm-cmake" />
-    <project groups="mathlibs" name="rpp" />
-    <project groups="mathlibs" name="TransferBench" />
-<!-- Projects for OpenMP-Extras -->
-    <project name="aomp" path="openmp-extras/aomp" />
-    <project name="aomp-extras" path="openmp-extras/aomp-extras" />
-    <project name="flang" path="openmp-extras/flang" />
-</manifest>
Author	SHA1	Message	Date
Peter Park	54ba8bfed1	update group name	2025-05-29 10:41:53 -04:00
Peter Park	55e13a3c38	add Falcon to vllm-benchmark-models.yaml	2025-05-08 14:13:03 -04:00