adding individual libraries

2026-01-10 07:08:08 -05:00 · 2025-09-16 09:29:55 +00:00
190 changed files with 4904 additions and 21018 deletions
--- a/.azuredevops/components/AMDMIGraphX.yml
+++ b/.azuredevops/components/AMDMIGraphX.yml
@@ -128,9 +128,6 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.28.6'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -155,7 +152,6 @@ jobs:
          -DCMAKE_BUILD_TYPE=Release
          -DGPU_TARGETS=${{ job.target }}
          -DAMDGPU_TARGETS=${{ job.target }}
-          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
          -DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
@@ -196,9 +192,6 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.28.6'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -224,7 +217,6 @@ jobs:
          -DCMAKE_BUILD_TYPE=Release
          -DGPU_TARGETS=${{ job.target }}
          -DAMDGPU_TARGETS=${{ job.target }}
-          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
          -DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -79,7 +79,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - task: Bash@3
      displayName: Add lit to PATH
      inputs:
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -131,7 +131,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -212,7 +212,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -37,7 +37,6 @@ parameters:
    - libdrm-dev
    - libelf-dev
    - libnuma-dev
-    - libsimde-dev
    - ninja-build
    - pkg-config
 - name: rocmDependencies
--- a/.azuredevops/components/amdsmi.yml
+++ b/.azuredevops/components/amdsmi.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: amdsmi
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -50,7 +31,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+  - job: amdsmi_build_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -74,7 +55,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -85,54 +65,50 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
-        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        parameters:
-          runRocminfo: false
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: '$(Agent.BuildDirectory)'
-          testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
-          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: amdsmi_test_${{ job.os }}_${{ job.target }}
+    dependsOn: amdsmi_build_${{ job.os }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      parameters:
+        runRocminfo: false
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: amdsmi
+        testDir: '$(Agent.BuildDirectory)'
+        testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
+        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/aqlprofile.yml
+++ b/.azuredevops/components/aqlprofile.yml
@@ -1,174 +0,0 @@
-parameters:
- name: componentName
-  type: string
-  default: aqlprofile
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
-# set to true if doing full build of ROCm stack
-# and dependencies are pulled from same pipeline
- name: aggregatePipeline
-  type: boolean
-  default: false
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-    - python3-pip
- name: rocmDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
- name: rocmTestDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
-    - rocprofiler-register
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ variables.MEDIUM_BUILD_POOL }}
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      parameters:
-        dependencyList:
-          - gtest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        consolidateBuildAndInstall: true
-        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/aqlprofile/cmake_modules
-          -DAQLPROFILE_BUILD_TESTS=ON
-          -DGPU_TARGETS=${{ job.target }}
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          gpuTarget: ${{ job.target }}
-
- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          preTargetFilter: ${{ parameters.componentName }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/rocm/share/hsa-amd-aqlprofile/
-          testExecutable: ./run_tests.sh
-          testParameters: ''
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -77,7 +77,6 @@ parameters:
    - clr
    - hipBLAS-common
    - llvm-project
-    - rocm-cmake
    - rocminfo
    - rocm_smi_lib
    - rocprofiler-register
@@ -145,7 +144,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -40,7 +40,6 @@ parameters:
    - gfortran
    - libgfortran5
    - libopenblas-dev
-    - liblapack-dev
 - name: pipModules
  type: object
  default:
@@ -54,7 +53,6 @@ parameters:
    - hipSPARSE
    - llvm-project
    - rocBLAS
-    - rocm-cmake
    - rocm_smi_lib
    - rocminfo
    - rocprofiler-register
@@ -68,7 +66,6 @@ parameters:
    - llvm-project
    - hipBLAS-common
    - hipBLASLt
-    - rocm-cmake
    - rocBLAS
    - rocminfo
    - rocprofiler-register
@@ -112,7 +109,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -128,13 +125,10 @@ jobs:
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-  # NOTE: content between `---` is for transition support between old/new build systems
-  # and should be removed once transition is complete.
-  # -----------------------------
  # Build and install gtest and lapack
  # $(Pipeline.Workspace)/deps is a temporary folder for the build process
  # $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
-    - script: mkdir -p $(Pipeline.Workspace)/deps
+    - script: mkdir $(Pipeline.Workspace)/deps
      displayName: Create temp folder for external dependencies
  # hipSPARSELt already has a CMake script for external deps, so we can just run that
  # https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt
@@ -150,35 +144,22 @@ jobs:
    - script: sudo make install
      displayName: Install hipSPARSELt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
-  # -----------------------------
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
-        # NOTE: the following options are old build only 
-        # and can be removed after full transition to new build
-        # -DAMDGPU_TARGETS=${{ job.target }}
-        # -DCMAKE_Fortran_COMPILER=f95
-        # -DTensile_LOGIC=
-        # -DTensile_CPU_THREADS=
-        # -DTensile_LIBRARY_FORMAT=msgpack
-        # -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-        # -DBUILD_CLIENTS_TESTS=ON
-        # -DBUILD_USE_LOCAL_TENSILE=OFF
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-          -DGPU_TARGETS=${{ job.target }}
-          -DAMDGPU_TARGETS=${{ job.target }}
          -DCMAKE_Fortran_COMPILER=f95
+          -DAMDGPU_TARGETS=${{ job.target }}
          -DTensile_LOGIC=
          -DTensile_CPU_THREADS=
          -DTensile_LIBRARY_FORMAT=msgpack
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_CLIENTS_TESTS=ON
          -DBUILD_USE_LOCAL_TENSILE=OFF
-          -DHIPSPARSELT_ENABLE_FETCH=ON
          -GNinja
        ${{ if ne(parameters.sparseCheckoutDir, '') }}:
          cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: hipTensor
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -70,7 +51,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.target }}
+  - job: hipTensor_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -85,21 +66,17 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_BUILD_TYPE=Release
          -DHIPTENSOR_BUILD_TESTS=ON
@@ -107,12 +84,9 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -120,47 +94,44 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.target }}
-      timeoutInMinutes: 90
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
-          testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: hipTensor_test_${{ job.target }}
+    timeoutInMinutes: 90
+    dependsOn: hipTensor_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: hipTensor
+        testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
+        testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/hipfort.yml
+++ b/.azuredevops/components/hipfort.yml
@@ -71,7 +71,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -39,9 +39,6 @@ parameters:
    - python3
    - python3-dev
    - python3-pip
-    - libgtest-dev
-    - libboost-filesystem-dev
-    - libboost-program-options-dev
 - name: pipModules
  type: object
  default:
@@ -110,12 +107,8 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      parameters:
-        dependencyList:
-          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -132,7 +125,7 @@ jobs:
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DORIGAMI_BUILD_SHARED_LIBS=ON
          -DORIGAMI_ENABLE_PYTHON=ON
@@ -213,15 +206,7 @@ jobs:
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin'
-          testExecutable: './origami-tests'
-          testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
      - script: |
-          set -e
          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH

          echo "--- Running origami_test.py ---"
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -1,35 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rccl
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
- name: systemsRepo
-  type: string
-  default: systems_repo
- name: systemsSparseCheckoutDir
-  type: string
-  default: 'projects/rocprofiler-sdk'
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -82,52 +57,37 @@ parameters:
  type: object
  default:
    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+      - gfx942:
+        target: gfx942
+      - gfx90a:
+        target: gfx90a
    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
-  type: object
-  default:
-    - rocprofiler-sdk:
-      name: rocprofiler-sdk
-      sparseCheckoutDir: ''
-      skipUnifiedBuild: 'false'
-      buildDependsOn:
-        - rccl_build
+      - gfx942:
+        target: gfx942
+      - gfx90a:
+        target: gfx90a

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}_${{ job.target }}
-    timeoutInMinutes: 120
+  - job: rccl_build_${{ job.target }}
+    timeoutInMinutes: 90
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: HIP_ROCCLR_HOME
      value: $(Build.BinariesDirectory)/rocm
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        submoduleBehaviour: recursive
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
@@ -137,14 +97,10 @@ jobs:
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
-        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
@@ -156,87 +112,58 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          gpuTarget: ${{ job.target }}
-          extraEnvVars:
-            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
-          installLatestCMake: true
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        gpuTarget: ${{ job.target }}
+        extraEnvVars:
+          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+        installLatestCMake: true

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-      timeoutInMinutes: 120
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          preTargetFilter: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          os: ${{ job.os }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin'
-          testExecutable: './rccl-UnitTests'
-          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-
- ${{ if parameters.triggerDownstreamJobs }}:
-  - ${{ each component in parameters.downstreamComponentMatrix }}:
-    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
-      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
-        parameters:
-          checkoutRepo: ${{ parameters.systemsRepo }}
-          sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
-          triggerDownstreamJobs: true
-          unifiedBuild: ${{ parameters.unifiedBuild }}
-          ${{ if parameters.unifiedBuild }}:
-            buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
-          ${{ else }}:
-            buildDependsOn: ${{ component.buildDependsOn }}
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rccl_test_${{ job.target }}
+    timeoutInMinutes: 120
+    dependsOn: rccl_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rccl
+        testDir: '$(Agent.BuildDirectory)/rocm/bin'
+        testExecutable: './rccl-UnitTests'
+        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rdc
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -52,7 +33,6 @@ parameters:
    - clr
    - hipBLAS-common
    - hipBLASLt
-    - hipRAND
    - llvm-project
    - rocBLAS
    - rocm-cmake
@@ -63,7 +43,6 @@ parameters:
    - rocprofiler
    - rocprofiler-register
    - rocprofiler-sdk
-    - rocRAND
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
@@ -95,11 +74,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
+  - job: rdc_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -110,22 +85,16 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
  # Build grpc
    - task: Bash@3
      displayName: 'git clone grpc'
@@ -135,7 +104,6 @@ jobs:
        workingDirectory: $(Build.SourcesDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
        cmakeSourceDir: $(Build.SourcesDirectory)/grpc
        installDir: $(Build.SourcesDirectory)/bin
@@ -149,7 +117,6 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DGRPC_ROOT="$(Build.SourcesDirectory)/bin"
@@ -159,12 +126,9 @@ jobs:
          -DAMDGPU_TARGETS=${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -172,64 +136,60 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      - name: ROCM_PATH
-        value: $(Agent.BuildDirectory)/rocm
-      - name: ROCM_DIR
-        value: $(Agent.BuildDirectory)/rocm
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-              downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - task: Bash@3
-        displayName: Setup test environment
-        inputs:
-          targetType: inline
-          script: |
-            sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
-            echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
-            sudo ldconfig -v
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - task: Bash@3
-        displayName: Test rdc
-        inputs:
-          targetType: inline
-          script: >-
-            $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
-            --batch_mode
-            --start_rdcd
-            --unauth_comm
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-          extraPaths: /home/user/workspace/rocm/bin
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rdc_test_${{ job.target }}
+    dependsOn: rdc_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    - name: ROCM_DIR
+      value: $(Agent.BuildDirectory)/rocm
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - task: Bash@3
+      displayName: Setup test environment
+      inputs:
+        targetType: inline
+        script: |
+          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
+          echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
+          sudo ldconfig -v
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - task: Bash@3
+      displayName: Test rdc
+      inputs:
+        targetType: inline
+        script: >-
+          $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
+          --batch_mode
+          --start_rdcd
+          --unauth_comm
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
+        extraPaths: /home/user/workspace/rocm/bin
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -70,7 +70,6 @@ parameters:
    - hipBLAS-common
    - hipBLASLt
    - llvm-project
-    - rocm-cmake
    - rocminfo
    - rocprofiler-register
    - rocm_smi_lib
@@ -155,7 +154,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -210,7 +210,7 @@ jobs:
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
-          extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
+          extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
--- a/.azuredevops/components/rocWMMA.yml
+++ b/.azuredevops/components/rocWMMA.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rocWMMA
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -85,11 +66,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
+  - job: rocWMMA_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -104,7 +81,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
@@ -126,12 +102,9 @@ jobs:
  # gfx1030 not supported in documentation
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -139,45 +112,43 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.target }}
-      timeoutInMinutes: 350
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          preTargetFilter: ${{ parameters.componentName }}
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rocWMMA_test_${{ job.target }}
+    timeoutInMinutes: 270
+    dependsOn: rocWMMA_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocWMMA
+        testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocm-cmake.yml
+++ b/.azuredevops/components/rocm-cmake.yml
@@ -81,7 +81,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocm-cmake
-        testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+        testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -14,41 +14,16 @@ parameters:
  type: object
  default:
    - cmake
-    - libdw-dev
    - libglfw3-dev
    - libmsgpack-dev
-    - libopencv-dev
    - libtbb-dev
-    - libtiff-dev
-    - libva-amdgpu-dev
-    - libva2-amdgpu
-    - mesa-amdgpu-va-drivers
-    - libavcodec-dev
-    - libavformat-dev
-    - libavutil-dev
    - ninja-build
    - python3-pip
-    - protobuf-compiler
-    - libprotoc-dev
- name: pipModules
-  type: object
-  default:
-    - future==1.0.0
-    - pytz==2022.1
-    - numpy==1.23
-    - google==3.0.0
-    - protobuf==3.12.4
-    - onnx==1.12.0
-    - nnef==1.0.7
 - name: rocmDependencies
  type: object
  default:
    - AMDMIGraphX
-    - aomp
-    - aomp-extras
    - clr
-    - half
-    - composable_kernel
    - hipBLAS
    - hipBLAS-common
    - hipBLASLt
@@ -58,36 +33,21 @@ parameters:
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - hipTensor
    - llvm-project
-    - MIOpen
-    - MIVisionX
-    - rocm_smi_lib
-    - rccl
-    - rocALUTION
    - rocBLAS
-    - rocDecode
    - rocFFT
-    - rocJPEG
    - rocPRIM
    - rocprofiler-register
-    - rocprofiler-sdk
    - ROCR-Runtime
    - rocRAND
    - rocSOLVER
    - rocSPARSE
    - rocThrust
-    - rocWMMA
-    - rpp
 - name: rocmTestDependencies
  type: object
  default:
    - AMDMIGraphX
-    - aomp
-    - aomp-extras
    - clr
-    - half
-    - composable_kernel
    - hipBLAS
    - hipBLAS-common
    - hipBLASLt
@@ -97,29 +57,18 @@ parameters:
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - hipTensor
    - llvm-project
-    - MIOpen
-    - MIVisionX
-    - rocm_smi_lib
-    - rccl
-    - rocALUTION
    - rocBLAS
-    - rocDecode
    - rocFFT
    - rocminfo
    - rocPRIM
-    - rocJPEG
    - rocprofiler-register
-    - rocprofiler-sdk
    - ROCR-Runtime
    - rocRAND
    - rocSOLVER
    - rocSPARSE
    - rocThrust
    - roctracer
-    - rocWMMA
-    - rpp

 - name: jobMatrix
  type: object
@@ -148,11 +97,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -214,10 +158,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -248,6 +188,5 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
        environment: test
        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocm-libraries.yml
+++ b/.azuredevops/components/rocm-libraries.yml
@@ -43,14 +43,9 @@ parameters:
    - ninja-build
    - python3-pip
    - python3-venv
-    - googletest
-    - libgtest-dev
-    - libgmock-dev
-    - libboost-filesystem-dev
 - name: pipModules
  type: object
  default:
-    - msgpack
    - joblib
    - "packaging>=22.0"
    - pytest
@@ -107,7 +102,7 @@ jobs:
    workspace:
      clean: all
    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
@@ -152,13 +147,6 @@ jobs:
          echo "##vso[task.prependpath]$USER_BASE/bin"
          echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"
        displayName: Set cmake configure paths
-    - task: Bash@3
-      displayName: Add ROCm binaries to PATH
-      inputs:
-        targetType: inline
-        script: |
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -65,13 +65,6 @@ parameters:
    - pytest
    - pytest-cov
    - pytest-xdist
- name: rocmDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
-    - rocprofiler-sdk
 - name: rocmTestDependencies
  type: object
  default:
@@ -108,12 +101,10 @@ jobs:
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
-    - name: ROCM_PATH
-      value: $(Agent.BuildDirectory)/rocm
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    workspace:
@@ -128,14 +119,6 @@ jobs:
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -79,27 +79,27 @@ parameters:
  type: object
  default:
    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+      - gfx942:
+        target: gfx942
+      - gfx90a:
+        target: gfx90a
    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+      - gfx942:
+        target: gfx942
+      - gfx90a:
+        target: gfx90a

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
+  - job: rocprofiler_sdk_build_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os}}_${{ job.target }}
+          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -107,7 +107,6 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -119,7 +118,6 @@ jobs:
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
-        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
@@ -134,7 +132,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: ${{ parameters.componentName }}
-        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCPROFILER_BUILD_TESTS=ON
@@ -146,7 +143,6 @@ jobs:
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
@@ -162,8 +158,8 @@ jobs:

 - ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }}
-      dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
+    - job: rocprofiler_sdk_test_${{ job.target }}
+      dependsOn: rocprofiler_sdk_build_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -181,7 +177,6 @@ jobs:
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
-          packageManager: ${{ job.packageManager }}
          registerROCmPackages: true
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -193,7 +188,6 @@ jobs:
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmDependencies }}
-          os: ${{ job.os }}
          gpuTarget: ${{ job.target }}
          ${{ if parameters.triggerDownstreamJobs }}:
              downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -208,7 +202,6 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
        parameters:
          componentName: ${{ parameters.componentName }}
-          os: ${{ job.os }}
          extraBuildFlags: >-
            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
            -DROCPROFILER_BUILD_TESTS=ON
@@ -220,8 +213,6 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-          testDir: $(Agent.BuildDirectory)/build
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -226,11 +226,8 @@ jobs:
            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
        parameters:
-          cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
    # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
          extraBuildFlags: >-
-            -DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocprofiler-systems
-            -DROCPROFSYS_USE_PYTHON=ON
            -DROCPROFSYS_BUILD_TESTING=ON
            -DROCPROFSYS_BUILD_DYNINST=ON
            -DROCPROFSYS_BUILD_LIBUNWIND=ON
@@ -248,13 +245,11 @@ jobs:
        displayName: Set up rocprofiler-systems env
        inputs:
          targetType: inline
-          script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
-          workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
+          script: source share/rocprofiler-systems/setup-env.sh
+          workingDirectory: build
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/s/build/tests/
-          testParameters: '--output-on-failure'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
        parameters:
          gpuTarget: ${{ job.target }}
--- a/.azuredevops/dependencies/cli11.yml
+++ b/.azuredevops/dependencies/cli11.yml
@@ -1,63 +0,0 @@
-parameters:
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
- name: cli11Version
-  type: string
-  default: ''
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt}
-      - { os: almalinux8, packageManager: dnf}
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: cli11_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - checkout: none
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - task: Bash@3
-      displayName: Clone cli11 ${{ parameters.cli11Version }}
-      inputs:
-        targetType: inline
-        script: git clone https://github.com/CLIUtils/CLI11.git -b ${{ parameters.cli11Version }}
-        workingDirectory: $(Agent.BuildDirectory)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        cmakeBuildDir: $(Agent.BuildDirectory)/CLI11/build
-        cmakeSourceDir: $(Agent.BuildDirectory)/CLI11
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DCMAKE_BUILD_TYPE=Release
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
--- a/.azuredevops/dependencies/yamlcpp.yml
+++ b/.azuredevops/dependencies/yamlcpp.yml
@@ -1,66 +0,0 @@
-parameters:
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
- name: yamlcppVersion
-  type: string
-  default: ''
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt}
-      - { os: almalinux8, packageManager: dnf}
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: yamlcpp_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - checkout: none
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - task: Bash@3
-      displayName: Clone yaml-cpp ${{ parameters.yamlcppVersion }}
-      inputs:
-        targetType: inline
-        script: git clone  https://github.com/jbeder/yaml-cpp.git -b ${{ parameters.yamlcppVersion }}
-        workingDirectory: $(Agent.BuildDirectory)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        cmakeBuildDir: $(Agent.BuildDirectory)/yaml-cpp/build
-        cmakeSourceDir: $(Agent.BuildDirectory)/yaml-cpp
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DCMAKE_BUILD_TYPE=Release
-          -DYAML_CPP_BUILD_TOOLS=OFF
-          -DYAML_BUILD_SHARED_LIBS=OFF
-          -DYAML_CPP_INSTALL=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
--- a/.azuredevops/tag-builds/cli11.yml
+++ b/.azuredevops/tag-builds/cli11.yml
@@ -1,23 +0,0 @@
-variables:
- group: common
- template: /.azuredevops/variables-global.yml
-
-parameters:
- name: cli11Version
-  type: string
-  default: "main"
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-
-trigger: none
-pr: none
-
-jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/cli11.yml
-    parameters:
-      cli11Version: ${{ parameters.cli11Version }}
--- a/.azuredevops/tag-builds/yaml-cpp.yml
+++ b/.azuredevops/tag-builds/yaml-cpp.yml
@@ -1,24 +0,0 @@
-variables:
- group: common
- template: /.azuredevops/variables-global.yml
-
-parameters:
- name: yamlcppVersion
-  type: string
-  default: "0.8.0"
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-
-trigger: none
-pr: none
-
-jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/yamlcpp.yml
-    parameters:
-      yamlcppVersion: ${{ parameters.yamlcppVersion }}
-      
--- a/.azuredevops/templates/steps/dependencies-cmake-latest.yml
+++ b/.azuredevops/templates/steps/dependencies-cmake-latest.yml
@@ -1,15 +1,10 @@
-parameters:
-  - name: cmakeVersion
-    type: string
-    default: '3.31.0'
-
 steps:
 - task: Bash@3
-  displayName: Install CMake ${{ parameters.cmakeVersion }}
+  displayName: Install CMake 3.31
  inputs:
    targetType: inline
    script: |
-      CMAKE_VERSION=${{ parameters.cmakeVersion }}
+      CMAKE_VERSION=3.31.0
      CMAKE_ROOT="$(Pipeline.Workspace)/cmake"

      echo "Downloading CMake $CMAKE_VERSION..."
--- a/.azuredevops/templates/steps/dependencies-dnf.yml
+++ b/.azuredevops/templates/steps/dependencies-dnf.yml
@@ -63,7 +63,6 @@ parameters:
    libopenblas-dev: openblas-devel
    libopenmpi-dev: openmpi-devel
    libpci-dev: libpciaccess-devel
-    libsimde-dev: simde-devel
    libssl-dev: openssl-devel
    # note: libstdc++-devel is in the base packages list
    libsystemd-dev: systemd-devel
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -35,8 +35,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    amdsmi:
-      pipelineId: 376
-      developBranch: develop
+      pipelineId: 99
+      developBranch: amd-staging
      hasGpuTarget: false
    aomp-extras:
      pipelineId: 111
@@ -46,10 +46,6 @@ parameters:
      pipelineId: 115
      developBranch: aomp-dev
      hasGpuTarget: false
-    aqlprofile:
-      pipelineId: 365
-      developBranch: develop
-      hasGpuTarget: false
    clr:
      pipelineId: 335
      developBranch: develop
@@ -115,7 +111,7 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    hipTensor:
-      pipelineId: 374
+      pipelineId: 105
      developBranch: develop
      hasGpuTarget: true
    llvm-project:
@@ -130,17 +126,13 @@ parameters:
      pipelineId: 80
      developBranch: develop
      hasGpuTarget: true
-    origami:
-      pipelineId: 364
-      developBranch: develop
-      hasGpuTarget: true
    rccl:
      pipelineId: 107
      developBranch: develop
      hasGpuTarget: true
    rdc:
-      pipelineId: 360
-      developBranch: develop
+      pipelineId: 100
+      developBranch: amd-staging
      hasGpuTarget: false
    rocAL:
      pipelineId: 151
@@ -227,8 +219,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-systems:
-      pipelineId: 345
-      developBranch: develop
+      pipelineId: 255
+      developBranch: amd-staging
      hasGpuTarget: true
    rocPyDecode:
      pipelineId: 239
@@ -263,7 +255,7 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    rocWMMA:
-      pipelineId: 370
+      pipelineId: 109
      developBranch: develop
      hasGpuTarget: true
    rpp:
--- a/.azuredevops/templates/steps/test.yml
+++ b/.azuredevops/templates/steps/test.yml
@@ -13,7 +13,7 @@ parameters:
  default: ctest
 - name: testParameters
  type: string
-  default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml
+  default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
 - name: extraTestParameters
  type: string
  default: ''
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
 .venv
 .vscode
 build
-__pycache__

 # documentation artifacts
 _build/
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -27,7 +27,6 @@ ASICs
 ASan
 ASAN
 ASm
-Async
 ATI
 atomicRMW
 AddressSanitizer
@@ -35,7 +34,6 @@ AlexNet
 Andrej
 Arb
 Autocast
-autograd
 BARs
 BatchNorm
 BLAS
@@ -45,7 +43,6 @@ Blit
 Blockwise
 Bluefield
 Bootloader
-Broadcom
 CAS
 CCD
 CDNA
@@ -65,7 +62,6 @@ CPU
 CPUs
 Cron
 CSC
-CSDATA
 CSE
 CSV
 CSn
@@ -75,11 +71,9 @@ CU
 CUDA
 CUs
 CXX
-CX
 Cavium
 CentOS
 ChatGPT
-Cholesky
 CoRR
 Codespaces
 Commitizen
@@ -87,13 +81,10 @@ CommonMark
 Concretized
 Conda
 ConnectX
-CountOnes
 CuPy
-customizable
 da
 Dashboarding
 Dataloading
-dataflows
 DBRX
 DDR
 DF
@@ -106,7 +97,6 @@ DIMM
 DKMS
 DL
 DMA
-DOMContentLoaded
 DNN
 DNNL
 DPM
@@ -125,8 +115,6 @@ Dependabot
 Deprecations
 DevCap
 DirectX
-Disaggregated
-disaggregated
 Dockerfile
 Dockerized
 Doxygen
@@ -135,13 +123,9 @@ ELMo
 ENDPGM
 EPYC
 ESXi
-EP
 EoS
-etcd
 fas
 FBGEMM
-FiLM
-FIFOs
 FFT
 FFTs
 FFmpeg
@@ -154,19 +138,15 @@ Filesystem
 FindDb
 Flang
 FlashAttention
-FlashInfer’s
-FlashInfer
 FluxBenchmark
 Fortran
 Fuyu
 GALB
 GAT
-GATNE
 GCC
 GCD
 GCDs
 GCN
-GCNN
 GDB
 GDDR
 GDR
@@ -185,19 +165,15 @@ Glibc
 GLXT
 Gloo
 GMI
-GNN
-GNNs
 GPG
 GPR
 GPT
 GPU
 GPU's
-GPUDirect
 GPUs
-GraphBolt
+Graphbolt
 GraphSage
 GRBM
-GRE
 GenAI
 GenZ
 GitHub
@@ -224,9 +200,7 @@ Haswell
 Higgs
 href
 Hyperparameters
-HybridEngine
 Huggingface
-IB
 ICD
 ICT
 ICV
@@ -235,11 +209,8 @@ IDEs
 IFWI
 IMDb
 IncDec
-instrSize
-interpolators
 IOMMU
 IOP
-IOPS
 IOPM
 IOV
 IRQ
@@ -256,7 +227,6 @@ Intersphinx
 Intra
 Ioffe
 JAX's
-JAXLIB
 Jinja
 JSON
 Jupyter
@@ -277,16 +247,12 @@ LLM
 LLMs
 LLVM
 LM
-logsumexp
-LRU
 LSAN
 LSan
 LTS
 LSTMs
-LteAll
 LanguageCrossEntropy
 LoRA
-MECO
 MEM
 MERCHANTABILITY
 MFMA
@@ -305,7 +271,6 @@ MNIST
 MPI
 MPT
 MSVC
-mul
 MVAPICH
 MVFFR
 Makefile
@@ -313,7 +278,6 @@ Makefiles
 Matplotlib
 Matrox
 MaxText
-MBT
 Megablocks
 Megatrends
 Megatron
@@ -323,15 +287,11 @@ Meta's
 Miniconda
 MirroredStrategy
 Mixtral
-MLA
 MosaicML
 MoEs
-Mooncake
 Mpops
 Multicore
 Multithreaded
-mx
-MXFP
 MyEnvironment
 MyST
 NANOO
@@ -367,7 +327,6 @@ OFED
 OMM
 OMP
 OMPI
-OOM
 OMPT
 OMPX
 ONNX
@@ -390,11 +349,9 @@ PCC
 PCI
 PCIe
 PEFT
-perf
 PEQT
 PIL
 PILImage
-PJRT
 POR
 PRNG
 PRs
@@ -414,7 +371,6 @@ Profiler's
 PyPi
 Pytest
 PyTorch
-QPS
 Qcycles
 Qwen
 RAII
@@ -476,9 +432,7 @@ SKU
 SKUs
 SLES
 SLURM
-Slurm
 SMEM
-SMFMA
 SMI
 SMT
 SPI
@@ -490,24 +444,18 @@ SWE
 SerDes
 ShareGPT
 Shlens
-simd
 Skylake
 Softmax
 Spack
 SplitK
 Supermicro
 Szegedy
-TagRAM
 TCA
 TCC
-TCCs
 TCI
 TCIU
 TCP
 TCR
-TVM
-THREADGROUPS
-threadgroups
 TensorRT
 TensorFloat
 TF
@@ -551,7 +499,6 @@ UltraChat
 Uncached
 Unittests
 Unhandled
-unwindowed
 VALU
 VBIOS
 VCN
@@ -568,13 +515,11 @@ Vanhoucke
 Vulkan
 WGP
 WGPs
-WR
 WX
 WikiText
 Wojna
 Workgroups
 Writebacks
-xcc
 XCD
 XCDs
 XGBoost
@@ -595,7 +540,6 @@ ZenDNN
 accuracies
 activations
 addr
-addEventListener
 ade
 ai
 alloc
@@ -611,7 +555,6 @@ autogenerated
 autotune
 avx
 awk
-az
 backend
 backends
 bb
@@ -629,7 +572,6 @@ boson
 bosons
 br
 BrainFloat
-btn
 buildable
 bursty
 bzip
@@ -641,21 +583,18 @@ centric
 changelog
 checkpointing
 chiplet
-classList
 cmake
 cmd
 coalescable
 codename
 collater
 comgr
-compat
 completers
 composable
 concretization
 config
 configs
 conformant
-const
 constructible
 convolutional
 convolves
@@ -690,14 +629,12 @@ denoised
 denoises
 denormalize
 dequantization
-dequantized
 dequantizes
 deserializers
 detections
 dev
 devicelibs
 devsel
-dgl
 dimensionality
 disambiguates
 distro
@@ -721,7 +658,6 @@ exascale
 executables
 ffmpeg
 filesystem
-forEach
 fortran
 fp
 framebuffer
@@ -730,14 +666,12 @@ galb
 gcc
 gdb
 gemm
-getAttribute
 gfortran
 gfx
 githooks
 github
 globals
 gnupg
-gpu
 grayscale
 gx
 gzip
@@ -792,7 +726,6 @@ invariants
 invocating
 ipo
 jax
-json
 kdb
 kfd
 kv
@@ -806,7 +739,6 @@ linalg
 linearized
 linter
 linux
-llm
 llvm
 lm
 localscratch
@@ -815,7 +747,6 @@ lossy
 macOS
 matchers
 maxtext
-megablocks
 megatron
 microarchitecture
 migraphx
@@ -844,7 +775,6 @@ opencv
 openmp
 openssl
 optimizers
-ol
 os
 oversubscription
 pageable
@@ -857,7 +787,6 @@ passthrough
 pe
 perfcounter
 performant
-piecewise
 perl
 pragma
 pre
@@ -893,8 +822,6 @@ recommenders
 quantile
 quantizer
 quasirandom
-querySelector
-querySelectorAll
 queueing
 qwen
 radeon
@@ -913,8 +840,6 @@ req
 resampling
 rescaling
 reusability
-rhel
-rl
 RLHF
 roadmap
 roc
@@ -959,23 +884,19 @@ scalability
 scalable
 scipy
 seealso
-selectedTag
 sendmsg
 seqs
 serializers
-setAttribute
 sglang
 shader
 sharding
 sigmoid
-sles
 sm
 smi
 softmax
 spack
 spmm
 src
-stanford
 stochastically
 strided
 subcommand
@@ -992,10 +913,8 @@ symlink
 symlinks
 sys
 tabindex
-targetContainer
 td
 tensorfloat
-tf
 th
 tokenization
 tokenize
@@ -1004,12 +923,10 @@ tokenizer
 tokenizes
 toolchain
 toolchains
-topk
 toolset
 toolsets
 torchtitan
 torchvision
-tp
 tqdm
 tracebacks
 txt
@@ -1032,7 +949,6 @@ USM
 UTCL
 UTIL
 utils
-UX
 vL
 variational
 vdi
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/RELEASE.md
+++ b/RELEASE.md
--- a/default.xml
+++ b/default.xml
@@ -1,17 +1,33 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-7.1.1"
+    <default revision="refs/tags/rocm-7.0.0"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
 <!--list of projects for ROCm-->
    <project name="ROCK-Kernel-Driver" />
+    <project name="ROCR-Runtime" />
    <project name="amdsmi" />
+    <project name="aqlprofile" />
+    <project name="rdc" />
    <project name="rocm_bandwidth_test" />
+    <project name="rocm_smi_lib" />
+    <project name="rocm-core" />
    <project name="rocm-examples" />
+    <project name="rocminfo" />
+    <project name="rocprofiler" />
+    <project name="rocprofiler-register" />
+    <project name="rocprofiler-sdk" />
+    <project name="rocprofiler-compute" />
+    <project name="rocprofiler-systems" />
+    <project name="roctracer" />
 <!--HIP Projects-->
+    <project name="hip" />
+    <project name="hip-tests" />
    <project name="HIPIFY" />
+    <project name="clr" />
+    <project name="hipother" />
 <!-- The following projects are all associated with the AMDGPU LLVM compiler -->
    <project name="half" />
    <project name="llvm-project" />
@@ -38,16 +54,21 @@
        hipFFT hipRAND hipSPARSE hipSPARSELt
        MIOpen rocBLAS rocFFT rocPRIM rocRAND
        rocSPARSE rocThrust Tensile -->
-    <project groups="mathlibs" name="rocm-libraries" />
-    <!-- The following components have been migrated to rocm-systems:
-        aqlprofile clr hip hip-tests hipother
-        rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute 
-        rocprofiler-register rocprofiler-sdk rocprofiler-systems 
-        rocprofiler rocr-runtime roctracer -->
-    <project groups="mathlibs" name="rocm-systems" />
+    <project groups="mathlibs" name="rocm-libraries">
+        <linkfile src="projects/hipcub" dest="hipCUB"/>
+        <linkfile src="projects/rocprim" dest="rocPRIM"/>
+        <linkfile src="projects/hiprand" dest="hipRAND"/>
+        <linkfile src="projects/rocrand" dest="rocRAND"/>
+        <linkfile src="projects/rocthrust" dest="rocThrust"/>
+        <linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
+        <linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
+        <linkfile src="projects/rocblas" dest="rocBLAS"/>
+        <linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
+        <linkfile src="projects/rocsparse" dest="rocSPARSE"/>
+    </project>
    <project groups="mathlibs" name="rocPyDecode" />
-    <project groups="mathlibs" name="rocSOLVER" />
    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
    <project groups="mathlibs" name="rocWMMA" />
    <project groups="mathlibs" name="rocm-cmake" />
    <project groups="mathlibs" name="rpp" />
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -25,69 +25,69 @@ additional licenses. Please review individual repositories for more information.
 <!-- spellcheck-disable -->
 | Component | License |
 |:---------------------|:-------------------------|
-| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/LICENSE.md) |
+| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/LICENSE.txt) |
 | [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
-| [AQLprofile](https://github.com/ROCm/rocm-systems/tree/develop/projects/aqlprofile/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/aqlprofile/LICENSE.md) |
+| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
 | [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
 | [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
 | [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
-| [HIP](https://github.com/ROCm/rocm-systems/tree/develop/projects/hip/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/hip/LICENSE.md) |
-| [hipamd](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/hipamd/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/hipamd/LICENSE.md) |
-| [hipBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblas/LICENSE.md) |
-| [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
+| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/amd-staging/LICENSE.txt) |
+| [hipamd](https://github.com/ROCm/clr/tree/amd-staging/hipamd) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/hipamd/LICENSE.txt) |
+| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
+| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
 | [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
-| [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
-| [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
+| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
+| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
 | [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
 | [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
-| [hipRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiprand/LICENSE.md) |
-| [hipSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsolver/LICENSE.md) |
-| [hipSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparse/LICENSE.md) |
-| [hipSPARSELt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparselt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparselt/LICENSE.md) |
-| [hipTensor](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiptensor/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiptensor/LICENSE) |
+| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
+| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
+| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
+| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
+| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
 | [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
 | [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
-| [MIOpen](https://github.com/ROCm/rocm-libraries/tree/develop/projects/miopen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
+| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
 | [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
 | [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
 | [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
-| [rocBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocblas/LICENSE.md) |
+| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
 | [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
 | [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
-| [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/LICENSE.md) |
+| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
 | [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v3.0](https://github.com/ROCm/ROCgdb/blob/amd-staging/COPYING3) |
 | [rocJPEG](https://github.com/ROCm/rocJPEG/) | [MIT](https://github.com/ROCm/rocJPEG/blob/develop/LICENSE) |
 | [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
-| [rocminfo](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocminfo/License.txt) |
+| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
 | [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [MIT](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
 | [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
 | [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
-| [ROCm-Core](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-core/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-core/LICENSE.md) |
-| [ROCm Compute Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-compute/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute/LICENSE.md) |
-| [ROCm Data Center (RDC)](https://github.com/ROCm/rocm-systems/tree/develop/projects/rdc/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rdc/LICENSE.md) |
+| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
+| [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
+| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
 | [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
-| [ROCm-OpenCL-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/opencl/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/opencl/LICENSE.md) |
+| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
 | [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
-| [ROCm SMI Lib](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-smi-lib/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-smi-lib/LICENSE.md) |
-| [ROCm Systems Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-systems/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-systems/LICENSE.md) |
+| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
+| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
 | [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
-| [rocPRIM](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocprim/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocprim/LICENSE.md) |
-| [ROCProfiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler/LICENSE.md) |
-| [ROCprofiler-SDK](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-sdk/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-sdk/LICENSE.md) |
+| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
+| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
+| [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
 | [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
-| [rocRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocrand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocrand/LICENSE.md) |
+| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
 | [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
-| [ROCR-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocr-runtime/LICENSE.txt) |
+| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/amd-staging/LICENSE.txt) |
 | [rocSHMEM](https://github.com/ROCm/rocSHMEM/) | [MIT](https://github.com/ROCm/rocSHMEM/blob/develop/LICENSE.md) |
-| [rocSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver/) | [BSD-2-Clause](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsolver/LICENSE.md) |
-| [rocSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsparse/LICENSE.md) |
-| [rocThrust](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust/) | [Apache 2.0](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocthrust/LICENSE) |
-| [ROCTracer](https://github.com/ROCm/rocm-systems/tree/develop/projects/roctracer/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/roctracer/LICENSE.md) |
-| [rocWMMA](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocwmma/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocwmma/LICENSE.md) |
-| [Tensile](https://github.com/ROCm/rocm-libraries/tree/develop/shared/tensile/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/shared/tensile/LICENSE.md) |
+| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
+| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
+| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
+| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
+| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
+| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
 | [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |

 Open sourced ROCm components are released via public GitHub
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -1,137 +1,133 @@
-ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
-      :ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
-      ,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
-      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
-      ,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
-      ,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
-      ,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
-      ,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
-      ,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
-      ,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
-      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
-      ,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
-      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
-      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
-      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
-      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
-      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`  [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
-      ,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
-      ,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
-      ,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
-      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
-      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
-      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
-      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
-      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
-      ,,,,,,,,,,,,,,,,,,,,,,
-      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
-      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
-      ,,,,,,,,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,,,,,,,,
-      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
-      ,,,,,,,,,,,,,,,,,,,,,,
-      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      Thrust,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      CUB,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      ,,,,,,,,,,,,,,,,,,,,,,
-     DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
-      ,,,,,,,,,,,,,,,,,,,,,,
-      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
-      :doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
-      :doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
-      :doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`RPP <rpp:index>`,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
-      ,,,,,,,,,,,,,,,,,,,,,,
-      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      ,,,,,,,,,,,,,,,,,,,,,,
-      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
-      :doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
-      :doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
-      :doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
-      :doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
-      :doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
-      :doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
-      :doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
-      :doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
-      :doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
-      :doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
-      :doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
-      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
-      ,,,,,,,,,,,,,,,,,,,,,,
-      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
-      :doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      ,,,,,,,,,,,,,,,,,,,,,,
-      SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
-      `hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
-      ,,,,,,,,,,,,,,,,,,,,,,
-      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
-      :doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
-      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
-      ,,,,,,,,,,,,,,,,,,,,,,
-      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
-      ,,,,,,,,,,,,,,,,,,,,,,
-      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,
-      :doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
-      ,,,,,,,,,,,,,,,,,,,,,,
-      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
-      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      `Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      ,,,,,,,,,,,,,,,,,,,,,,
-      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      :doc:`HIP <hip:index>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
+ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
+      ,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
+      ,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
+      ,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+      ,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
+      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
+      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
+      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
+      ,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
+      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
+      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
+      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
+      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
+      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
+      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
+,,,,,,,,,,,,,,,,,,
+      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
+,,,,,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,,,
+      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
+      ,,,,,,,,,,,,,,,,,,
+      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      Thrust,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      CUB,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+,,,,,,,,,,,,,,,,,,
+      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      ,,,,,,,,,,,,,,,,,,
+      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
+      ,,,,,,,,,,,,,,,,,,
+      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      ,,,,,,,,,,,,,,,,,,
+      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
+      :doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
+      ,,,,,,,,,,,,,,,,,,
+      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      ,,,,,,,,,,,,,,,,,,
+      SUPPORT LIBS,,,,,,,,,,,,,,,,,,
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
+      ,,,,,,,,,,,,,,,,,,
+      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`AMD SMI <amdsmi:index>`,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      ,,,,,,,,,,,,,,,,,,
+      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCTracer <roctracer:index>`,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
+      ,,,,,,,,,,,,,,,,,,
+      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
+      ,,,,,,,,,,,,,,,,,,
+      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
+      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+,,,,,,,,,,,,,,,,,,
+      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`HIP <hip:index>`,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -10,9 +10,10 @@ Use this matrix to view the ROCm compatibility and system requirements across su

 You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.

-GPUs listed in the following table support compute workloads (no display
-information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
-workloads, see the :doc:`Use ROCm on Radeon and Ryzen <radeon:index>` to verify
+Accelerators and GPUs listed in the following table support compute workloads (no display
+information or graphics). If you’re using ROCm with AMD Radeon or Radeon Pro GPUs for graphics
+workloads, see the `Use ROCm on Radeon GPU documentation
+<https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ to verify
 compatibility and system requirements.

 .. |br| raw:: html
@@ -22,31 +23,28 @@ compatibility and system requirements.
 .. container:: format-big-table

  .. csv-table::
-      :header: "ROCm Version", "7.1.1", "7.1.0", "6.4.0"
+      :header: "ROCm Version", "6.4.3", "6.4.2", "6.3.0"
      :stub-columns: 1

-      :ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
-      ,"RHEL 10.1, 10.0,  9.7, |br| 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.5, 9.4"
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.5, 9.4"
      ,RHEL 8.10,RHEL 8.10,RHEL 8.10
-      ,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
-      ,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
-      ,"Debian 13, 12","Debian 13, 12",Debian 12
-      ,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0
-      ,Rocky Linux 9,Rocky Linux 9,
+      ,"SLES 15 SP7, SP6","SLES 15 SP7, SP6","SLES 15 SP6, SP5"
+      ,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
+      ,Debian 12 [#single-node]_,Debian 12 [#single-node]_,
+      ,Azure Linux 3.0 [#mi300x]_,Azure Linux 3.0 [#mi300x]_,
      ,.. _architecture-support-compatibility-matrix:,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
-      ,CDNA3,CDNA3,CDNA3
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
      ,RDNA4,RDNA4,
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
-      ,gfx1201,gfx1201,
-      ,gfx1200,gfx1200,
-      ,gfx1101,gfx1101,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,gfx1201 [#RDNA-OS]_,
+      ,gfx1200 [#RDNA-OS]_,gfx1200 [#RDNA-OS]_,
+      ,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,
      ,gfx1100,gfx1100,gfx1100
      ,gfx1030,gfx1030,gfx1030
      ,gfx942,gfx942,gfx942
@@ -54,116 +52,116 @@ compatibility and system requirements.
      ,gfx908,gfx908,gfx908
      ,,,
      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.6, 2.5, 2.4, 2.3"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.4.35
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
-      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,0.7.0
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3
      ,,,
      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0
      ,,,
      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
-      Thrust,2.8.5,2.8.5,2.5.0
-      CUB,2.8.5,2.8.5,2.5.0
+      Thrust,2.5.0,2.5.0,2.3.2
+      CUB,2.5.0,2.5.0,2.3.2
      ,,,
-      DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      :doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.0 [#mi325x_KVM]_, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
+      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.12.0
-      :doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.4.0
-      :doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.2.0
-      :doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.2.0
-      :doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,0.10.0
-      :doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,0.8.0
-      :doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.3.1
-      :doc:`RPP <rpp:index>`,2.1.0,2.1.0,1.9.10
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
-      :doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
-      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.1,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,2.4.0
-      :doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,0.12.0
-      :doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.18
-      :doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.6.0
-      :doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,2.12.0
-      :doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,2.4.0
-      :doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
-      :doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
-      :doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,4.4.0
-      :doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
-      :doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
-      :doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
-      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.10.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.0
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.1
+      :doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.1,4.3.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.2,3.27.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
-      :doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,3.4.0
-      :doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
-      :doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,3.4.0
-      :doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,3.3.0
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.1,3.3.0
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0
      ,,,
      SUPPORT LIBS,,,
-      `hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,6.4.0
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.3.42131
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.3,6.4.2,6.3.0
      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,25.3.0
-      :doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
+      :doc:`AMD SMI <amdsmi:index>`,25.5.1,25.5.1,24.7.1
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.7.0,7.5.0,7.4.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.1.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.0.0
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
-      :doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.1,3.0.0
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.2,0.1.0
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60403,2.0.60402,2.0.60300
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0
+      :doc:`ROCTracer <roctracer:index>`,4.1.60403,4.1.60402,4.1.60300
      ,,,
      DEVELOPMENT TOOLS,,,
-      :doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.24455
      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.2
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3
      ,,,
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,19.0.0.25133
-      :doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,19.0.0.25133
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,19.0.0.25133
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25224,18.0.0.24455
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25224,18.0.0.24491
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25224,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,6.4.43482
-      :doc:`HIP <hip:index>`,7.1.52802,7.1.25424,6.4.43482
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43484,6.3.42131
+      :doc:`HIP <hip:index>`,6.4.43484,6.4.43484,6.3.42131
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0
+

 .. rubric:: Footnotes

-.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`_.
-.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`_.
-.. [#dgl_compat] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
-.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
-.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
-.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
-.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
+.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
+.. [#7700XT-OS] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
+.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.

+
 .. _OS-kernel-versions:

 Operating systems, kernel and Glibc versions
@@ -176,36 +174,28 @@ Use this lookup table to confirm which operating system and kernel versions are
   :widths: 40, 20, 30, 20
   :stub-columns: 1

-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 GA, 6.11 HWE", 2.39
   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
-   ,,
-   `Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.1, 6.12.0-124, 2.39
-   ,10.0, 6.12.0-55, 2.39
-   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.7, 5.14.0-611, 2.34
-   ,9.6, 5.14.0-570, 2.34
+   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
   ,9.5, 5.14+, 2.34
-   ,9.4, 5.14.0-427, 2.34
+   ,9.4, 5.14+, 2.34
+   ,9.3, 5.14+, 2.34
   ,,
-   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
+   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0+, 2.28
+   ,8.9, 4.18.0, 2.28
   ,,
-   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.40-150700.51, 2.38
+   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.11.0+, 2.38
   ,15 SP6, "6.5.0+, 6.4.0", 2.38
   ,15 SP5, 5.14.21, 2.31
   ,,
-   `Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
-   ,,
-   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 10, 6.12.0 (UEK), 2.39
-   ,9, 6.12.0 (UEK), 2.34
+   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 5.15.0 (UEK), 2.35
   ,8, 5.15.0 (UEK), 2.28
   ,,
-   `Debian <https://www.debian.org/download>`_,13, 6.12, 2.35
-   ,12, 6.1.0, 2.36
+   `Debian <https://www.debian.org/download>`_,12, 6.1, 2.36
   ,,
-   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
+   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.60, 2.38
   ,,

 .. note::
@@ -238,19 +228,26 @@ Expand for full historical view of:

   .. rubric:: Footnotes

-   .. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
-   .. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
-   .. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
-   .. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
-   .. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
-   .. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
-   .. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
-   .. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
-   .. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
-   .. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
-   .. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
-   .. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
-   .. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
-   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+   .. [#mi300x-past-60] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
+   .. [#single-node-past-60] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
+   .. [#7700XT-OS-past-60] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
+   .. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
+   .. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
+   .. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
+   .. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
+   .. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
+   .. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
+   .. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
+   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
+   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
+   .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+   .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
+   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+   .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
+   .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+   .. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+   .. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
+   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   
--- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Deep Graph Library (DGL) compatibility
-    :keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility
+    :keywords: GPU, DGL compatibility

 .. version-set:: rocm_version latest

@@ -10,42 +10,24 @@
 DGL compatibility
 ********************************************************************************

-Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable 
+Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance and scalable 
 Python package for deep learning on graphs. DGL is framework agnostic, meaning 
-that if a deep graph model is a component in an end-to-end application, the rest of 
+if a deep graph model is a component in an end-to-end application, the rest of 
 the logic is implemented using PyTorch.  

-DGL provides a high-performance graph object that can reside on either CPUs or GPUs. 
-It bundles structural data features for better control and provides a variety of functions 
-for computing with graph objects, including efficient and customizable message passing 
-primitives for Graph Neural Networks.
+* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository. 
+* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository. 
+* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
+* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>` 
+  to install and get started.

-Support overview
-================================================================================
-
- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl 
-  <https://github.com/ROCm/dgl>`__ repository, which differs from the 
-  `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.
-
- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`, 
-  which include ROCm, DGL, and all required dependencies.
-
-  - See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-DGL is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__, 
-`ROCm 6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__, and `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.

 Supported devices
--------------------------------------------------------------------------------
+================================================================================
+
+- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt)
+- **Partially Supported**: TF32 with AMD Instinct MI250X

-**Officially Supported**: AMD Instinct™ MI300X, MI250X

 .. _dgl-recommendations:

@@ -53,42 +35,23 @@ Use cases and recommendations
 ================================================================================

 DGL can be used for Graph Learning, and building popular graph models like  
-GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
+GAT, GCN and GraphSage. Using these we can support a variety of use-cases such as:

 - Recommender systems
 - Network Optimization and Analysis
 - 1D (Temporal) and 2D (Image) Classification
 - Drug Discovery

-For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
+Multiple use cases of DGL have been tested and verified.
+However, a recommended example follows a drug discovery pipeline using the ``SE3Transformer``.
+Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_, 
+where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs. 

-* Although multiple use cases of DGL have been tested and verified, a few have been  
-  outlined in the `DGL in the Real World: Running GNNs on Real Use Cases 
-  <https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog 
-  post, which walks through four real-world graph neural network (GNN) workloads 
-  implemented with the Deep Graph Library on ROCm. It covers tasks ranging from 
-  heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph 
-  regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use 
-  case, the authors detail: the dataset and task, how DGL is used, and their experience 
-  porting to ROCm. It is shown that DGL codebases often run without modification, with 
-  seamless integration of graph operations, message passing, sampling, and convolution. 
+Coverage includes:

-* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware 
-  <https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__ 
-  blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform, 
-  bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges 
-  the gap between dense tensor frameworks and the irregular nature of graph data through a 
-  graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and 
-  interoperability across frameworks like PyTorch and TensorFlow. AMD’s ROCm integration 
-  enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers 
-  and open-source repositories. This marks a major step in AMD's mission to advance open, 
-  scalable AI ecosystems beyond traditional architectures.
+- Single-GPU training/inference
+- Multi-GPU training

-You can pre-process datasets and begin training on AMD GPUs through:
-
-* Single-GPU training/inference
-* Multi-GPU training

 .. _dgl-docker-compat:

@@ -99,17 +62,16 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories represent the latest available DGL version from the official Docker Hub. 
+AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
+with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
 Click the |docker-icon| to view the image on Docker Hub.

-.. list-table::
+.. list-table:: DGL Docker image components
    :header-rows: 1
    :class: docker-image-compatibility

-    * - Docker image
-      - ROCm
+    * - Docker
      - DGL
      - PyTorch
      - Ubuntu
@@ -117,195 +79,130 @@ Click the |docker-icon| to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.8.0/images/sha256-943698ddf54c22a7bcad2e5b4ff467752e29e4ba6d0c926789ae7b242cbd92dd"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>

-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-b2ec286a035eb7d0a6aab069561914d21a3cac462281e9c024501ba5ccedfbf7"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>

-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu22.04_py3.10_pytorch_2.7.1/images/sha256-d27aee16df922ccf0bcd9107bfcb6d20d34235445d456c637e33ca6f19d11a51"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>

-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm6.4.3_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-f3ba6a3c9ec9f6c1cde28449dc9780e0c4c16c4140f4b23f158565fbfd422d6b"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
-
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
-      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
-
-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
-      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
-
-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
-      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_


    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>

-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
+      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
-
-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
-      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
      

 Key ROCm libraries for DGL
 ================================================================================

 DGL on ROCm depends on specific libraries that affect its features and performance.
-Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended.
+Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended.
 If you prefer to build it yourself, ensure the following dependencies are installed:

 .. list-table:: 
    :header-rows: 1

    * - ROCm library
-      - ROCm 7.0.0 Version
-      - ROCm 6.4.x Version
+      - Version
      - Purpose
    * - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
-      - 1.1.0
-      - 1.1.0
+      - :version-ref:`"Composable Kernel" rocm_version`
      - Enables faster execution of core operations like matrix multiplication
        (GEMM), convolutions and transformations.
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
-      - 3.0.0
-      - 2.4.0
+      - :version-ref:`hipBLAS rocm_version`
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
-      - 1.0.0
-      - 0.12.0
+      - :version-ref:`hipBLASLt rocm_version`
      - hipBLASLt is an extension of the hipBLAS library, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
    * - `hipCUB <https://github.com/ROCm/hipCUB>`_
-      - 4.0.0
-      - 3.4.0
+      - :version-ref:`hipCUB rocm_version`
      - Provides a C++ template library for parallel algorithms for reduction,
        scan, sort and select.
    * - `hipFFT <https://github.com/ROCm/hipFFT>`_
-      - 1.0.20
-      - 1.0.18
+      - :version-ref:`hipFFT rocm_version`
      - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
    * - `hipRAND <https://github.com/ROCm/hipRAND>`_
-      - 3.0.0
-      - 2.12.0
+      - :version-ref:`hipRAND rocm_version`
      - Provides fast random number generation for GPUs.
    * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
-      - 3.0.0
-      - 2.4.0
+      - :version-ref:`hipSOLVER rocm_version`
      - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
        singular value decompositions (SVD).
    * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
-      - 4.0.1
-      - 3.2.0
+      - :version-ref:`hipSPARSE rocm_version`
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
    * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
-      - 0.2.4
-      - 0.2.3
+      - :version-ref:`hipSPARSELt rocm_version`
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
    * - `hipTensor <https://github.com/ROCm/hipTensor>`_
-      - 2.0.0
-      - 1.5.0
+      - :version-ref:`hipTensor rocm_version`
      - Optimizes for high-performance tensor operations, such as contractions.
    * - `MIOpen <https://github.com/ROCm/MIOpen>`_
-      - 3.5.0
-      - 3.4.0
+      - :version-ref:`MIOpen rocm_version`
      - Optimizes deep learning primitives such as convolutions, pooling,
        normalization, and activation functions.
    * - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
-      - 2.13.0
-      - 2.12.0
+      - :version-ref:`MIGraphX rocm_version`
      - Adds graph-level optimizations, ONNX models and mixed precision support
        and enable Ahead-of-Time (AOT) Compilation.
    * - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
-      - 3.3.0
-      - 3.2.0
+      - :version-ref:`MIVisionX rocm_version`
      - Optimizes acceleration for computer vision and AI workloads like
        preprocessing, augmentation, and inferencing.
    * - `rocAL <https://github.com/ROCm/rocAL>`_
-      - 3.3.0
-      - 2.2.0
+      - :version-ref:`rocAL rocm_version`
      - Accelerates the data pipeline by offloading intensive preprocessing and
        augmentation tasks. rocAL is part of MIVisionX.
    * - `RCCL <https://github.com/ROCm/rccl>`_
-      - 2.26.6
-      - 2.22.3
+      - :version-ref:`RCCL rocm_version`
      - Optimizes for multi-GPU communication for operations like AllReduce and
        Broadcast.
    * - `rocDecode <https://github.com/ROCm/rocDecode>`_
-      - 1.0.0
-      - 0.10.0
+      - :version-ref:`rocDecode rocm_version`
      - Provides hardware-accelerated data decoding capabilities, particularly
        for image, video, and other dataset formats.
    * - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
-      - 1.1.0
-      - 0.8.0
+      - :version-ref:`rocJPEG rocm_version`
      - Provides hardware-accelerated JPEG image decoding and encoding.
    * - `RPP <https://github.com/ROCm/RPP>`_
-      - 2.0.0
-      - 1.9.10
+      - :version-ref:`RPP rocm_version`
      - Speeds up data augmentation, transformation, and other preprocessing steps.
    * - `rocThrust <https://github.com/ROCm/rocThrust>`_
-      - 4.0.0
-      - 3.3.0
+      - :version-ref:`rocThrust rocm_version`
      - Provides a C++ template library for parallel algorithms like sorting,
        reduction, and scanning.
    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
-      - 2.0.0
-      - 1.7.0
+      - :version-ref:`rocWMMA rocm_version`
      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
        multiplication (GEMM) and accumulation operations with mixed precision
        support.
@@ -314,14 +211,14 @@ If you prefer to build it yourself, ensure the following dependencies are instal
 Supported features
 ================================================================================

-Many functions and methods available upstream are also supported in DGL on ROCm.
+Many functions and methods available in DGL Upstream are also supported in DGL ROCm.
 Instead of listing them all, support is grouped into the following categories to provide a general overview. 

 * DGL Base
 * DGL Backend 
 * DGL Data
 * DGL Dataloading
-* DGL Graph
+* DGL DGLGraph
 * DGL Function
 * DGL Ops
 * DGL Sampling
@@ -333,29 +230,26 @@ Instead of listing them all, support is grouped into the following categories to
 * DGL NN
 * DGL Optim
 * DGL Sparse
-* GraphBolt
+

 Unsupported features
 ================================================================================

-* TF32 Support (only supported for PyTorch 2.7 and above)
-* Kineto/ROCTracer integration
+* Graphbolt
+* Partial TF32 Support (MI250x only)
+* Kineto/ ROCTracer integration


 Unsupported functions
 ================================================================================

-* ``bfs``
+* ``more_nnz``
 * ``format``
 * ``multiprocess_sparse_adam_state_dict``
+* ``record_stream_ndarray``
 * ``half_spmm``
 * ``segment_mm`` 
 * ``gather_mm_idx_b``
+* ``pgexplainer``
 * ``sample_labors_prob``
 * ``sample_labors_noprob``
-* ``sparse_admin``
-
-Previous versions
-===============================================================================
-See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases
-of the ``ROCm/dgl`` Docker image.
--- a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
@@ -1,108 +0,0 @@
-:orphan:
-
-.. meta::
-    :description: FlashInfer compatibility
-    :keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility
-
-.. version-set:: rocm_version latest
-
-********************************************************************************
-FlashInfer compatibility
-********************************************************************************
-
-`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator 
-for Large Language Models (LLMs) that provides a high-performance implementation of graphics 
-processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well 
-as advanced performance across diverse scenarios.
-
-FlashInfer features highly efficient attention kernels, load-balanced scheduling, and memory-optimized 
-techniques, while supporting customized attention variants. It’s compatible with ``torch.compile``, and 
-offers high-performance LLM-specific operators, with easy integration through PyTorch, and C++ APIs.
-
-.. note::
-
-  The ROCm port of FlashInfer is under active development, and some features are not yet available. 
-  For the latest feature compatibility matrix, refer to the ``README`` of the 
-  `https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
-
-Support overview
-================================================================================
-
- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer 
-  <https://github.com/ROCm/flashinfer>`__ repository, which differs from the 
-  `https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__ 
-  upstream repository.
-
- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`, 
-  which include ROCm, FlashInfer, and all required dependencies.
-
-  - See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
-**Officially Supported**: AMD Instinct™ MI300X
-
-
-.. _flashinfer-recommendations:
-
-Use cases and recommendations
-================================================================================
-
-This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
-In the decode phase, tokens are generated sequentially, with the model predicting each new 
-token based on the previously generated tokens and the input context.
-
-FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense 
-attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
-
-Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm 
-also implements cascade attention from upstream to reduce memory usage. 
-
-For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for examples and best practices to optimize your workloads on AMD GPUs.
-
-.. _flashinfer-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tag and associated
-inventories represent the latest available FlashInfer version from the official Docker Hub. 
-Click |docker-icon| to view the image on Docker Hub.
-
-.. list-table:: 
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - ROCm
-      - FlashInfer
-      - PyTorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/flashinfer/flashinfer-0.2.5_rocm6.4_ubuntu24.04_py3.12_pytorch2.7/images/sha256-558914838821c88c557fb6d42cfbc1bdb67d79d19759f37c764a9ee801f93313"><i class="fab fa-docker fa-lg"></i> rocm/flashinfer</a>
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - `v0.2.5 <https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.5>`__
-      - `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
-      - 24.04
-      - `3.12 <https://www.python.org/downloads/release/python-3129/>`__
-
-
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
   :description: JAX compatibility
-   :keywords: GPU, JAX, deep learning, framework compatibility
+   :keywords: GPU, JAX compatibility

 .. version-set:: rocm_version latest

@@ -10,58 +10,42 @@
 JAX compatibility
 *******************************************************************************

-`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library 
-for array-oriented numerical computation (similar to NumPy), with automatic differentiation 
-and just-in-time (JIT) compilation to enable high-performance machine learning research.
+JAX provides a NumPy-like API, which combines automatic differentiation and the
+Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
+learning at scale.

-JAX provides an API that combines automatic differentiation and the 
-Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine 
-learning at scale. JAX uses composable transformations of Python and NumPy through 
-JIT compilation, automatic vectorization, and parallelization.
+JAX uses composable transformations of Python and NumPy through just-in-time
+(JIT) compilation, automatic vectorization, and parallelization. To learn about
+JAX, including profiling and optimizations, see the official `JAX documentation
+<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.

-Support overview
-================================================================================
+ROCm support for JAX is upstreamed, and users can build the official source code
+with ROCm support:

- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax 
-  <https://github.com/ROCm/rocm-jax>`__ repository, which differs from the 
-  `https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.
+- ROCm JAX release:

- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`, 
-  which include ROCm, JAX, and all required dependencies.
+  - Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
+    with ROCm and JAX preinstalled.

-  - See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>` 
-    for installation and setup instructions.
+  - ROCm JAX repository: `ROCm/jax <https://github.com/ROCm/jax>`_

-  - You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__ 
-    for additional context.
+  - See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
+    to get started.

-Version support
--------------------------------------------------------------------------------
+- Official JAX release:

-AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
-quarterly alongside new ROCm releases. These images undergo full AMD testing.
-`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
-follow upstream JAX releases and use the latest available ROCm version.
+  - Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_

-JAX Plugin-PJRT with JAX/JAXLIB compatibility
-================================================================================
+  - See the `AMD GPU (Linux) installation section
+    <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
+    the JAX documentation.

-Portable JIT Runtime (PJRT) is an open, stable interface for device runtime and
-compiler. The following table details the ROCm version compatibility matrix
-between JAX Plugin–PJRT and JAX/JAXLIB.
+.. note::

-.. list-table::
-    :header-rows: 1
-
-    * - JAX Plugin-PJRT
-      - JAX/JAXLIB
-      - ROCm
-    * - 0.7.1
-      - 0.7.1
-      - 7.1.1, 7.1.0
-    * - 0.6.0
-      - 0.6.2, 0.6.0
-      - 7.0.2, 7.0.1, 7.0.0
+   AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
+   quarterly alongside new ROCm releases. These images undergo full AMD testing.
+   `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
+   follow upstream JAX releases and use the latest available ROCm version.

 Use cases and recommendations
 ================================================================================
@@ -87,7 +71,7 @@ Use cases and recommendations
 * The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
  outlines the process of fine-tuning a Bidirectional Encoder Representations
  from Transformers (BERT)-based large language model (LLM) using JAX for a text
-  classification task. The blog post discusses techniques for parallelizing the
+  classification task. The blog post discuss techniques for parallelizing the
  fine-tuning across multiple AMD GPUs and assess the model's performance on a
  holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
  and the General Language Understanding Evaluation (GLUE) benchmark dataset was
@@ -95,7 +79,7 @@ Use cases and recommendations

 * The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
  provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
-  GPU using ROCm. The page is aimed at helping users achieve optimal
+  accelerator using ROCm. The page is aimed at helping users achieve optimal
  performance for deep learning and other high-performance computing tasks on
  the MI300X GPU.

@@ -106,15 +90,75 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
 Docker image compatibility
 ================================================================================

-AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
-with ROCm backends on Docker Hub.
+.. |docker-icon| raw:: html

-For ``jax-community`` images, see `rocm/jax-community
-<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
+   <i class="fab fa-docker"></i>

-To find the right image tag, see the :ref:`JAX on ROCm installation
-documentation <rocm-install-on-linux:jax-docker-support>` for a list of
-available ``rocm/jax`` images.
+AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest JAX version from the official Docker Hub and are validated for
+`ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table:: JAX Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - JAX
+      - Linux
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.12/images/sha256-8918fa806a172c1a10eb2f57131eb31b5d7c8fa1656b8729fe7d3d736112de83"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+
+      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
+      - Ubuntu 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.10/images/sha256-a394be13c67b7fc602216abee51233afd4b6cb7adaa57ca97e688fba82f9ad79"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+
+      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
+      - Ubuntu 22.04
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+
+AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_.
+
+.. list-table:: JAX community Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - JAX
+      - Linux
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.12.8/images/sha256-25dfaa0183e274bd0a3554a309af3249c6f16a1793226cb5373f418e39d3146a"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.12.8 <https://www.python.org/downloads/release/python-3128/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.11.11/images/sha256-ff9baeca9067d13e6c279c911e5a9e5beed0817d24fafd424367cc3d5bd381d7"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.11.11 <https://www.python.org/downloads/release/python-31111/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.10.16/images/sha256-8bab484be1713655f74da51a191ed824bb9d03db1104fd63530a1ac3c37cf7b1"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_

 .. _key_rocm_libraries:

@@ -250,7 +294,7 @@ The ROCm supported data types in JAX are collected in the following table.

 .. note::

-  JAX data type support is affected by the :ref:`key_rocm_libraries` and it's
+  JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
  collected on :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
  page.

@@ -266,54 +310,5 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
  Since version 0.1.56, JAX has full support for ROCm, and the
  :ref:`Known issues and important notes <jax_comp_known_issues>` section
  contains details about limitations specific to the ROCm backend. The list of
-  JAX API modules are maintained by the JAX project and is subject to change.
+  JAX API modules is maintained by the JAX project and is subject to change. 
  Refer to the official Jax documentation for the most up-to-date information.
-
-Key features and enhancements for ROCm 7.0
-===============================================================================
-
- Upgraded XLA backend: Integrates a newer XLA version, enabling better
-  optimizations, broader operator support, and potential performance gains.
-
- RNN support: Native RNN support (including LSTMs via ``jax.experimental.rnn``)
-  now available on ROCm, aiding sequence model development.
-
- Comprehensive linear algebra capabilities: Offers robust ``jax.linalg``
-  operations, essential for scientific and machine learning tasks.
-
- Expanded AMD GPU architecture support: Provides ongoing support for gfx1101
-  GPUs and introduces support for gfx950 and gfx12xx GPUs.
-
- Mixed FP8 precision support: Enables ``lax.dot_general`` operations with mixed FP8
-  types, offering pathways for memory and compute efficiency.
-
- Streamlined PyPi packaging: Provides reliable PyPi wheels for JAX on ROCm,
-  simplifying the installation process.
-
- Pallas experimental kernel development: Continued Pallas framework
-  enhancements for custom GPU kernels, including new intrinsics (specific
-  kernel behaviors under review).
-
- Improved build system and CI: Enhanced ROCm build system and CI for greater
-  reliability and maintainability.
-
- Enhanced distributed computing setup: Improved JAX setup in multi-GPU
-  distributed environments.
-
-.. _jax_comp_known_issues:
-
-Known issues and notes for ROCm 7.0
-===============================================================================
-
- ``nn.dot_product_attention``: Certain configurations of ``jax.nn.dot_product_attention``
-  may cause segmentation faults, though the majority of use cases work correctly.
-
- SVD with dynamic shapes: SVD on inputs with dynamic/symbolic shapes might result in an error.
-  SVD with static shapes is unaffected.
-
- QR decomposition with symbolic shapes: QR decomposition operations may fail when using
-  symbolic/dynamic shapes in shape polymorphic contexts.
-
- Pallas kernels: Specific advanced Pallas kernels may exhibit variations in
-  numerical output or resource usage. These are actively reviewed as part of
-  Pallas's experimental development.
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -1,8 +1,8 @@
 :orphan:

 .. meta::
-    :description: llama.cpp compatibility
-    :keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility
+    :description: llama.cpp deep learning framework compatibility
+    :keywords: GPU, GGML, llama.cpp compatibility

 .. version-set:: rocm_version latest

@@ -16,36 +16,37 @@ for Large Language Model (LLM) inference that runs on both central processing un
 a simple, dependency-free setup. 

 The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
-to accelerate inference and reduce memory usage. Originally built as a CPU-first library, 
+to speed up inference and reduce memory usage. Originally built as a CPU-first library, 
 llama.cpp is easy to integrate with other programming environments and is widely 
 adopted across diverse platforms, including consumer devices. 

-Support overview
-================================================================================
+ROCm support for llama.cpp is upstreamed, and you can build the official source code
+with ROCm support:

- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp 
-  <https://github.com/ROCm/llama.cpp>`__ repository, which differs from the 
-  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.
+- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
+  <https://github.com/ROCm/llama.cpp>`_ repository.

- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`, 
-  which include ROCm, llama.cpp, and all required dependencies.
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
+
+- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
+  which includes ROCm, llama.cpp, and all required dependencies.

  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
-    for installation and setup instructions.
+    to install and get started.

-  - You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__ 
-    for additional context.
+  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
+    in the upstream llama.cpp documentation.

-Version support
--------------------------------------------------------------------------------
+.. note::

-llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and 
-`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
+  llama.cpp is supported on ROCm 6.4.0.

 Supported devices
--------------------------------------------------------------------------------
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210

-**Officially Supported**: AMD Instinct™ MI325X, MI300X, MI210

 Use cases and recommendations
 ================================================================================
@@ -69,7 +70,7 @@ llama.cpp is also used in a range of real-world applications, including:
 For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
 where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.

- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__ 
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__, 
  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
  AMD Instinct GPUs within the ROCm ecosystem. 
@@ -83,9 +84,9 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
+AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
 with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories represent the latest available llama.cpp versions from the official Docker Hub.
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
 Click |docker-icon| to view the image on Docker Hub.

 .. important::
@@ -104,115 +105,8 @@ Click |docker-icon| to view the image on Docker Hub.
      - Server Docker
      - Light Docker
      - llama.cpp
-      - ROCm
      - Ubuntu

-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_full/images/sha256-a94f0c7a598cc6504ff9e8371c016d7a2f93e69bf54a36c870f9522567201f10g"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server/images/sha256-be175932c3c96e882dfbc7e20e0e834f58c89c2925f48b222837ee929dfc47ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_light/images/sha256-d8ba0c70603da502c879b1f8010b439c8e7fa9f6cbdac8bbbbbba97cb41ebc9e"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_full/images/sha256-37582168984f25dce636cc7288298e06d94472ea35f65346b3541e6422b678ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_server/images/sha256-7e70578e6c3530c6591cc2c26da24a9ee68a20d318e12241de93c83224f83720"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_light/images/sha256-9a5231acf88b4a229677bc2c636ea3fe78a7a80f558bd80910b919855de93ad5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - 22.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_full/images/sha256-5960fc850024a8a76451f9eaadd89b7e59981ae9f393b407310c1ddf18892577"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_server/images/sha256-1b79775d9f546065a6aaf9ca426e1dd4ed4de0b8f6ee83687758cc05af6538e6"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_light/images/sha256-8f863c4c2857ae42bebd64e4f1a0a1e7cc3ec4503f243e32b4a4dcad070ec361"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_full/images/sha256-888879b3ee208f9247076d7984524b8d1701ac72611689e89854a1588bec9867"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_server/images/sha256-90e4ff99a66743e33fd00728cd71a768588e5f5ef355aaa196669fe65ac70672"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_light/images/sha256-bd447a049939cb99054f8fbf3f2352870fe906a75e2dc3339c845c08b9c53f9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - 22.04
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_full/images/sha256-5b3a1bc4889c1fcade434b937fbf9cc1c22ff7dc0317c130339b0c9238bc88c4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_server/images/sha256-5228ff99d0f627a9032d668f4381b2e80dc1e301adc3e0821f26d8354b175271"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_light/images/sha256-b12723b332a826a89b7252dddf868cbe4d1a869562fc4aa4032f59e1a683b968"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_full/images/sha256-cd6e21a6a73f59b35dd5309b09dd77654a94d783bf13a55c14eb8dbf8e9c2615"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_server/images/sha256-c2b4689ab2c47e6626e8fea22d7a63eb03d47c0fde9f5ef8c9f158d15c423e58"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_light/images/sha256-1acc28f29ed87db9cbda629cb29e1989b8219884afe05f9105522be929e94da4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
-      - 22.04
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_full/images/sha256-2f8ae8a44510d96d52dea6cb398b224f7edeb7802df7ec488c6f63d206b3cdc9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_server/images/sha256-fece497ff9f4a28b12f645de52766941da8ead8471aa1ea84b61d4b4568e51f2"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_light/images/sha256-3e14352fa6f8c6128b23cf9342531c20dbfb522550b626e09d83b260a1947022"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_full/images/sha256-80763062ef0bec15038c35fd01267f1fc99a5dd171d4b48583cc668b15efad69"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_server/images/sha256-db2a6c957555ed83b819bbc54aea884a93192da0fb512dae63d32e0dc4e8ab8f"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_light/images/sha256-c6dbb07cc655fb079d5216e4b77451cb64a9daa0585d23b6fb8b32cb22021197"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - 22.04
-
    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
@@ -223,52 +117,40 @@ Click |docker-icon| to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
      - 24.04

-
 Key ROCm libraries for llama.cpp
 ================================================================================

 llama.cpp functionality on ROCm is determined by its underlying library
 dependencies. These ROCm components affect the capabilities, performance, and
-feature set available to developers. Ensure you have the required libraries for 
-your corresponding ROCm version.
+feature set available to developers.

 .. list-table::
    :header-rows: 1

    * - ROCm library
-      - ROCm 7.0.0 version
-      - ROCm 6.4.x version
+      - Version
      - Purpose
      - Usage
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
-      - 3.0.0
-      - 2.4.0
+      - :version-ref:`hipBLAS rocm_version`
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
      - Supports operations such as matrix multiplication, matrix-vector
        products, and tensor contractions. Utilized in both dense and batched
        linear algebra operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-      - 1.0.0
-      - 0.12.0
+      - :version-ref:`hipBLASLt rocm_version`
      - hipBLASLt is an extension of the hipBLAS library, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipblasLt
        kernels where possible.
    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
-      - 2.0.0
-      - 1.7.0
+      - :version-ref:`rocWMMA rocm_version`
      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
        multiplication (GEMM) and accumulation operations with mixed precision
        support.
      - Can be used to enhance the flash attention performance on AMD compute, by enabling
-        the flag during compile time.
-
-Previous versions
-===============================================================================
-See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
-of the ``ROCm/llama.cpp`` Docker image.
+        the flag during compile time.
--- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Megablocks compatibility
-    :keywords: GPU, megablocks, deep learning, framework compatibility
+    :keywords: GPU, megablocks, compatibility

 .. version-set:: rocm_version latest

@@ -10,42 +10,28 @@
 Megablocks compatibility
 ********************************************************************************

-`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library 
-for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training. 
+Megablocks is a light-weight library for mixture-of-experts (MoE) training. 
 The core of the system is efficient "dropless-MoE" and standard MoE layers. 
-Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM 
-<https://github.com/stanford-futuredata/Megatron-LM>`__, 
+Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_, 
 where data and pipeline parallel training of MoEs is supported.

-Support overview
-================================================================================
+* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository. 
+* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository. 
+* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled. 
+* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.

- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks 
-  <https://github.com/ROCm/megablocks>`__ repository, which differs from the 
-  `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
+.. note::

- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`, 
-  which includes ROCm, Megablocks, and all required dependencies.
-
-  - See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
+  Megablocks is supported on ROCm 6.3.0.

 Supported devices
--------------------------------------------------------------------------------
+================================================================================

- **Officially Supported**: AMD Instinct™ MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
+- **Officially Supported**: AMD Instinct MI300X
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X

 Supported models and features
--------------------------------------------------------------------------------
+================================================================================

 This section summarizes the Megablocks features supported by ROCm.

@@ -55,28 +41,20 @@ This section summarizes the Megablocks features supported by ROCm.
 * Mixture-of-Experts
 * dropless-Mixture-of-Experts

+
 .. _megablocks-recommendations:

 Use cases and recommendations
 ================================================================================

-* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs 
-  <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__ 
-  blog post guides how to leverage the ROCm platform for pre-training using the 
-  Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts 
-  (MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it 
-  demonstrates how block-sparse computations can enhance scalability and efficiency in MoE 
-  training. The guide provides step-by-step instructions for setting up the environment, 
-  including cloning the repository, building the Docker image, and running the training container. 
-  Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training 
-  language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE 
-  training workflows for large-scale transformer models.
-
+The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ 
+guide how to leverage the ROCm platform for pre-training using the Megablocks framework. 
 It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:

 * Single-GPU pre-training
 * Multi-GPU pre-training

+
 .. _megablocks-docker-compat:

 Docker image compatibility
@@ -86,9 +64,10 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tag and associated
-inventories represent the latest available Megablocks version from the official Docker Hub. 
+AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
+with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
+inventories represent the latest Megatron-LM version from the official Docker Hub.
+The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: PyTorch compatibility
-    :keywords: GPU, PyTorch, deep learning, framework compatibility
+    :keywords: GPU, PyTorch compatibility

 .. version-set:: rocm_version latest

@@ -15,42 +15,40 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
 using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
 `RCCL <https://github.com/ROCm/rccl>`__ libraries.

-PyTorch provides two high-level features:
+ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
+to independent compatibility considerations, this results in two distinct
+release cycles for PyTorch on ROCm:

- Tensor computation (like NumPy) with strong GPU acceleration
+- ROCm PyTorch release:

- Deep neural networks built on a tape-based autograd system (rapid computation 
-  of multiple partial derivatives or gradients)
+  - Provides the latest version of ROCm but might not necessarily support the
+    latest stable PyTorch version.

-Support overview
-================================================================================
+  - Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
+    preinstalled.

-ROCm support for PyTorch is upstreamed into the official PyTorch repository. 
-ROCm development is aligned with the stable release of PyTorch, while upstream 
-PyTorch testing uses the stable release of ROCm to maintain consistency:
+  - ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__

- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch 
-  <https://github.com/ROCm/pytorch>`__ repository, which differs from the 
-  `https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.
+  - See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
+    to get started.

- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`, 
-  which include ROCm, PyTorch, and all required dependencies.
+- Official PyTorch release:

-  - See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>` 
-    for installation and setup instructions.
+  - Provides the latest stable version of PyTorch  but might not necessarily
+    support the latest ROCm version.

-  - You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or 
-    `Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.
+  - Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
+
+  - See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
+    or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
+    to get started.

 PyTorch includes tooling that generates HIP source code from the CUDA backend.
 This approach allows PyTorch to support ROCm without requiring manual code
 modifications. For more information, see :doc:`HIPIFY <hipify:index>`.

-Version support
--------------------------------------------------------------------------------
-
-AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
-quarterly alongside new ROCm releases. These images undergo full AMD testing.
+ROCm development is aligned with the stable release of PyTorch, while upstream
+PyTorch testing uses the stable release of ROCm to maintain consistency.

 .. _pytorch-recommendations:

@@ -75,12 +73,12 @@ Use cases and recommendations

 * The :doc:`Instinct MI300X workload optimization guide </how-to/rocm-for-ai/inference-optimization/workload>`
  provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
-  GPU using ROCm. This guide helps users achieve optimal performance for
+  accelerator using ROCm. This guide helps users achieve optimal performance for
  deep learning and other high-performance computing tasks on the MI300X
-  GPU.
+  accelerator.

 * The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
-  describes how PyTorch integrates with ROCm for AI workloads. It outlines the
+  describes how PyTorch integrates with ROCm for AI workloads It outlines the
  use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
  GPU hardware for training and inference tasks in AI applications.

@@ -91,12 +89,141 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
 Docker image compatibility
 ================================================================================

-AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
-with ROCm backends on Docker Hub.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`PyTorch on ROCm installation
-documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
-available ``rocm/pytorch`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: PyTorch Docker image components
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker
+      - PyTorch
+      - Ubuntu
+      - Python
+      - Apex
+      - torchvision
+      - TensorBoard
+      - MAGMA
+      - UCX
+      - OMPI
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-6a287591500b4048a9556c1ecc92bc411fd3d552f6c8233bc399f18eb803e8d6"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-06b967629ba6657709f04169832cd769a11e6b491e8b1394c361d42d7a0c8b43"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-62022414217ef6de33ac5b1341e57db8a48e8573fa2ace12d48aa5edd4b99ef0"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.10.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-469a7f74fc149aff31797e011ee41978f6a190adc69fa423b3c6a718a77bd985"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 22.04
+      - `3.11 <https://www.python.org/downloads/release/python-31113/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-37f41a1cd94019688669a1b20d33ea74156e0c129ef6b8270076ef214a6a1a2c"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-60824ba83dc1b9d94164925af1f81c0235c105dd555091ec04c57e05177ead1b"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-fe944fe083312f901be6891ab4d3ffebf2eaf2cf4f5f0f435ef0b76ec714fabd"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-1d59251c47170c5b8960d1172a4dbe52f5793d8966edd778f168eaf32d56661a"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
+      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
+      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__

 Key ROCm libraries for PyTorch
 ================================================================================
@@ -239,8 +366,7 @@ feature set available to developers.
 Supported modules and data types
 ================================================================================

-The following section outlines the supported data types, modules, and domain
-libraries available in PyTorch on ROCm.
+The following section outlines the supported data types, modules, and domain libraries available in PyTorch on ROCm.

 Supported data types
 --------------------------------------------------------------------------------
@@ -339,7 +465,7 @@ with ROCm.
    * - Library
      - Description

-    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
+    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_ 
      - Audio and signal processing library for PyTorch. Provides utilities for
        audio I/O, signal and data processing functions, datasets, model
        implementations, and application components for audio and speech
@@ -361,7 +487,16 @@ with ROCm.
        popular datasets, model architectures, and common image transformations
        for computer vision applications.

-    * - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
+    * - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
+      - Text processing library for PyTorch. Provides data processing utilities
+        and popular datasets for natural language processing, including
+        tokenization, vocabulary management, and text embeddings.
+
+        **Note:** ``torchtext`` does not implement ROCm-specific kernels. 
+        ROCm acceleration is provided through the underlying PyTorch framework
+        and ROCm library integration. Only official release exists.
+
+    * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
      - Beta library of common modular data loading primitives for easily
        constructing flexible and performant data pipelines, with features still
        in prototype stage.
@@ -398,101 +533,3 @@ with ROCm.
        dispatching.

        **Note:** Only official release exists.
-
-Key features and enhancements for PyTorch 2.9 with ROCm 7.1.1
-================================================================================
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b
-
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later
-
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization
-
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations
-
- Improved tensor.item() performance by removing redundant synchronization
-
- Enhanced performance for element-wise operations and reduction kernels
-
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components
-
- Resolved device error in Inductor when using CUDA graph trees with HIP
-
- Corrected logsumexp scaling in AOTriton-based SDPA implementation
-
- Added stream graph capture status validation in memory copy synchronization functions
-
-Key features and enhancements for PyTorch 2.8 with ROCm 7.1
-================================================================================
-
- MIOpen deep learning optimizations: Further optimized NHWC BatchNorm feature.
-
- Added float8 support for the DeepSpeed extension, allowing for decreased
-  memory footprint and increased throughput in training and inference workloads.
-
- ``torch.nn.functional.scaled_dot_product_attention`` now calling optimized
-  flash attention kernel automatically.
-
-Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
-================================================================================
-
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
-  TunableOp operations, improved offline tuning for ScaledGEMM operations,
-  submatrix offline tuning capabilities, and better logging for BLAS operations
-  without bias vectors.
-
- Expanded GPU architecture support: Provides optimized support for newer GPU
-  architectures, including gfx1200 and gfx1201 with preferred hipBLASLt backend
-  selection, along with improvements for gfx950 and gfx1100 Series GPUs.
-
- Advanced Triton Integration: AOTriton 0.10b introduces official support for
-  gfx950 and gfx1201, along with experimental support for gfx1101, gfx1151,
-  gfx1150, and gfx1200.
-
- Improved element-wise kernel performance: Delivers enhanced vectorized
-  element-wise kernels with better support for heterogeneous tensor types and
-  optimized input vectorization for tensors with mixed data types.
-
- MIOpen deep learning optimizations: Enables NHWC BatchNorm by default on
-  ROCm 7.0+, provides ``maxpool`` forward and backward performance improvements
-  targeting ResNet scenarios, and includes updated launch configurations for
-  better performance.
-
- Enhanced memory and tensor operations: Features fixes for in-place ``aten``
-  sum operations with specialized templated kernels, improved 3D tensor
-  performance with NHWC format, and better handling of memory-bound matrix
-  multiplication operations.
-
- Robust testing and quality improvements: Includes comprehensive test suite
-  updates with improved tolerance handling for Navi3x architectures, generalized
-  ROCm-specific test conditions, and enhanced unit test coverage for Flash
-  Attention and Memory Efficient operations.
-
- Composable Kernel (CK) updates: Features updated CK submodule integration with
-  the latest optimizations and performance improvements for core mathematical
-  operations.
-
- Development and debugging enhancements: Includes improved source handling for
-  dynamic compilation, better error handling for atomic operations, and enhanced
-  state checking for trace operations.
-
- Integrate APEX fused layer normalization, which can have positive impact on
-  text-to-video models.
-
- Integrate APEX distributed fused LAMB and distributed fused ADAM, which can
-  have positive impact on BERT-L and Llama2-SFT.
-
- FlashAttention v3 has been integrated for AMD GPUs.
-
- `Pytorch C++ extensions <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_
-  provide a mechanism for compiling custom operations that can be used during
-  network training or inference. For AMD platforms, ``amdclang++`` has been
-  validated as the supported compiler for building these extensions.
-
-Known issues and notes for PyTorch 2.7/2.8 with ROCm 7.0 and ROCm 7.1
-================================================================================
-
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
-  ``matmul.allow_bf16_reduced_precision_reduction`` options under
-  ``torch.backends.cuda`` are not supported. As a result,
-  reduced-precision reductions using FP16 or BF16 accumulation types are not
-  available.
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -1,8 +1,8 @@
 :orphan:

 .. meta::
-    :description: Ray compatibility
-    :keywords: GPU, Ray, deep learning, framework compatibility
+    :description: Ray deep learning framework compatibility
+    :keywords: GPU, Ray compatibility

 .. version-set:: rocm_version latest

@@ -19,36 +19,37 @@ simplifying machine learning computations.
 Ray is a general-purpose framework that runs many types of workloads efficiently. 
 Any Python application can be scaled with Ray, without extra infrastructure.

-Support overview
-================================================================================
+ROCm support for Ray is upstreamed, and you can build the official source code
+with ROCm support: 

- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray 
-  <https://github.com/ROCm/ray>`__ repository, which differs from the 
-  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.
+- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
+  <https://github.com/ROCm/ray>`_ repository.

- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`, 
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
+
+- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>` 
  which includes ROCm, Ray, and all required dependencies.

-  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels 
-    <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+    for instructions to get started.
+
+  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
+    in the upstream Ray documentation.
+
+  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.

-  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
-    for installation and setup instructions.
+.. note::

-  - You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
+  Ray is supported on ROCm 6.4.1.

 Supported devices
--------------------------------------------------------------------------------
+================================================================================

 **Officially Supported**: AMD Instinct™ MI300X, MI210

+
 Use cases and recommendations
 ================================================================================

@@ -87,15 +88,15 @@ Docker image compatibility

 AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
 with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories represent the latest Ray version from the official Docker Hub.
-Click the |docker-icon| icon to view the image on Docker Hub.
+associated inventories represent the latest Ray version from the official Docker Hub and are validated for
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.

 .. list-table::
    :header-rows: 1
    :class: docker-image-compatibility

    * - Docker image
-      - ROCm
      - Ray
      - Pytorch
      - Ubuntu
@@ -104,7 +105,6 @@ Click the |docker-icon| icon to view the image on Docker Hub.
    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
      - 2.6.0+git684f6f2
      - 24.04
--- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Stanford Megatron-LM compatibility
-    :keywords: Stanford, Megatron-LM, deep learning, framework compatibility
+    :keywords: Stanford, Megatron-LM, compatibility

 .. version-set:: rocm_version latest

@@ -10,50 +10,34 @@
 Stanford Megatron-LM compatibility
 ********************************************************************************

-Stanford Megatron-LM is a large-scale language model training framework developed 
-by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. 
-It is designed to train massive transformer-based language models efficiently by model 
-and data parallelism. 
+Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
+designed to train massive transformer-based language models efficiently by model and data parallelism. 

-It provides efficient tensor, pipeline, and sequence-based model parallelism for 
-pre-training transformer-based language models such as GPT (Decoder Only), BERT 
-(Encoder Only), and T5 (Encoder-Decoder). 
+* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository. 
+* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository. 
+* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled. 
+* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.

-Support overview
+.. note::
+
+	Stanford Megatron-LM is supported on ROCm 6.3.0.
+
+
+Supported Devices
 ================================================================================

- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM 
-  <https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the 
-  `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
+- **Officially Supported**: AMD Instinct MI300X
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X

- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`, 
-  which includes ROCm, Stanford Megatron-LM, and all required dependencies.
-
-  - See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
- **Officially Supported**: AMD Instinct™ MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210

 Supported models and features
--------------------------------------------------------------------------------
+================================================================================

 This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.

 Models:

-* BERT
+* Bert
 * GPT
 * T5
 * ICT
@@ -70,24 +54,13 @@ Features:
 Use cases and recommendations
 ================================================================================

-The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:
+See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post  
+to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs. 
+Coverage includes:

-* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs 
-  <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__ 
-  blog post guides how to leverage the ROCm platform for pre-training using the 
-  Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts 
-  (MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it 
-  demonstrates how block-sparse computations can enhance scalability and efficiency in MoE 
-  training. The guide provides step-by-step instructions for setting up the environment, 
-  including cloning the repository, building the Docker image, and running the training container. 
-  Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training 
-  language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE 
-  training workflows for large-scale transformer models.
+  * Single-GPU pre-training
+  * Multi-GPU pre-training

-It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
-
-* Single-GPU pre-training
-* Multi-GPU pre-training

 .. _megatron-lm-docker-compat:

@@ -98,9 +71,10 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
+AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
 with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
+inventories represent the latest Megatron-LM version from the official Docker Hub.
+The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
@@ -108,7 +82,6 @@ Click |docker-icon| to view the image on Docker Hub.
    :class: docker-image-compatibility

    * - Docker image
-      - ROCm
      - Stanford Megatron-LM
      - PyTorch
      - Ubuntu
@@ -118,7 +91,6 @@ Click |docker-icon| to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>

-      - `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
      - `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 24.04
--- a/docs/compatibility/ml-compatibility/taichi-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/taichi-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Taichi compatibility
-    :keywords: GPU, Taichi, deep learning, framework compatibility
+    :keywords: GPU, Taichi compatibility

 .. version-set:: rocm_version latest

@@ -19,52 +19,28 @@ Taichi is widely used across various domains, including real-time physical simul
 numerical computing, augmented reality, artificial intelligence, computer vision, robotics, 
 visual effects in film and gaming, and general-purpose computing.

-Support overview
-================================================================================
+* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
+* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
+* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
+* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.

- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi 
-  <https://github.com/ROCm/taichi>`__ repository, which differs from the 
-  `https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.
+.. note::

- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`, 
-  which includes ROCm, Taichi, and all required dependencies.
+	Taichi is supported on ROCm 6.3.2.

-  - See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
- **Officially Supported**: AMD Instinct™ MI250X, MI210X (with the exception of Taichi’s GPU rendering system, CGUI)
- **Upcoming Support**: AMD Instinct™ MI300X
+Supported devices and features
+===============================================================================
+There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
+AMD Instinct MI300X series GPUs will be supported by November.

 .. _taichi-recommendations:

 Use cases and recommendations
 ================================================================================
-
-* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs 
-  <https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
-  blog highlights Taichi as an open-source programming language designed for high-performance 
-  numerical computation, particularly in domains like real-time physical simulation, 
-  artificial intelligence, computer vision, robotics, and visual effects. Taichi 
-  is embedded in Python and uses just-in-time (JIT) compilation frameworks like 
-  LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility 
-  of Taichi in enabling complex simulations and numerical algorithms, making 
-  it ideal for developers working on compute-intensive tasks. Developers are 
-  encouraged to follow recommended coding patterns and utilize Taichi decorators 
-  for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples 
-  <https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images 
-  integrating ROCm, PyTorch, and Taichi are provided for simplified installation 
-  and deployment, making it easier to leverage Taichi for advanced computational workloads.
+To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators. 
+A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository, 
+providing practical insights and foundational knowledge for working with the Taichi programming language. 
+You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.

 .. _taichi-docker-compat:

@@ -76,8 +52,9 @@ Docker image compatibility
   <i class="fab fa-docker"></i>

 AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
-with ROCm backends on Docker Hub. The following Docker image tag and associated inventories 
+with ROCm backends on Docker Hub. The following Docker image tags and associated inventories 
 represent the latest Taichi version from the official Docker Hub.
+The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_. 
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: TensorFlow compatibility
-    :keywords: GPU, TensorFlow, deep learning, framework compatibility
+    :keywords: GPU, TensorFlow compatibility

 .. version-set:: rocm_version latest

@@ -12,46 +12,115 @@ TensorFlow compatibility

 `TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
 solving machine learning, deep learning, and AI problems. It can solve many
-problems across different sectors and industries, but primarily focuses on
-neural network training and inference. It is one of the most popular deep 
-learning frameworks and is very active in open-source development.
-
-Support overview
-================================================================================
-
- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream 
-  <https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the 
-  `https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
-
- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`, 
-  which include ROCm, TensorFlow, and all required dependencies.
-
-  - See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
+problems across different sectors and industries but primarily focuses on
+neural network training and inference. It is one of the most popular and
+in-demand frameworks and is very active in open-source contribution and
+development.

 The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
 includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
 <http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
-fixes, updates, and support for the latest ROCm versions.
+fixes, updates, and support for the latest ROCM versions.
+
+- ROCm TensorFlow release:
+
+  - Offers :ref:`Docker images <tensorflow-docker-compat>` with
+    ROCm and TensorFlow pre-installed.
+
+  - ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
+
+  - See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
+    to get started.
+
+- Official TensorFlow release:
+
+  - Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
+
+  - See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
+
+  .. note::
+
+     The official TensorFlow documentation does not cover ROCm support. Use the
+     ROCm documentation for installation instructions for Tensorflow on ROCm.
+     See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.

 .. _tensorflow-docker-compat:

 Docker image compatibility
-================================================================================
+===============================================================================

-AMD provides preconfigured Docker images with TensorFlow and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/tensorflow>`__ and are the
-recommended way to get started with deep learning with TensorFlow on ROCm.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`TensorFlow on ROCm installation
-documentation <rocm-install-on-linux:tensorflow-docker-support>` for a list of
-available ``rocm/tensorflow`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `TensorFlow images
+<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
+Docker Hub. The following Docker image tags and associated inventories are
+validated for `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__. Click
+the |docker-icon| icon to view the image on Docker Hub.
+
+.. list-table:: TensorFlow Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - TensorFlow
+      - Ubuntu
+      - Python
+      - TensorBoard
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.18-dev/images/sha256-96754ce2d30f729e19b497279915b5212ba33d5e408e7e5dd3f2304d87e3441e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.18-dev/images/sha256-fa741508d383858e86985a9efac85174529127408102558ae2e3a4ac894eea1e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.17-dev/images/sha256-3a0aef09f2a8833c2b64b85874dd9449ffc2ad257351857338ff5b706c03a418"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.17-dev/images/sha256-bc7341a41ebe7ab261aa100732874507c452421ef733e408ac4f05ed453b0bc5"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.16-dev/images/sha256-4841a8df7c340dab79bf9362dad687797649a00d594e0832eb83ea6880a40d3b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.16-dev/images/sha256-883fa95aba960c58a3e46fceaa18f03ede2c7df89b8e9fd603ab2d47e0852897"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__


 Critical ROCm libraries for TensorFlow
@@ -136,7 +205,7 @@ The following section maps supported data types and GPU-accelerated TensorFlow
 features to their minimum supported ROCm and TensorFlow versions.

 Data types
---------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The data type of a tensor is specified using the ``dtype`` attribute or
 argument, and TensorFlow supports a wide range of data types for different use
@@ -254,7 +323,7 @@ are as follows:
      - 1.7

 Features
---------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 This table provides an overview of key features in TensorFlow and their
 availability in ROCm.
@@ -346,7 +415,7 @@ availability in ROCm.
      - 1.9.2

 Distributed library features
-----------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Enables developers to scale computations across multiple devices on a single machine or
 across multiple machines.
--- a/docs/compatibility/ml-compatibility/verl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/verl-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
   :description: verl compatibility
-   :keywords: GPU, verl, deep learning, framework compatibility
+   :keywords: GPU, verl compatibility

 .. version-set:: rocm_version latest

@@ -10,58 +10,24 @@
 verl compatibility
 *******************************************************************************

-Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)  
-is a reinforcement learning framework designed for large language models (LLMs). 
-verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model 
-that makes it easy to define and run complex post-training dataflows efficiently. 
+Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs). 
+verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.

-Its modular APIs separate computation from data, allowing smooth integration with other frameworks. 
-It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
-verl achieves high training and generation throughput by building on existing LLM frameworks. 
-Its 3D-HybridEngine reduces memory use and communication overhead when switching between training 
-and inference, improving overall performance.
+* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl. 
+* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
+* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled. 
+* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.

-Support overview
-================================================================================
+.. note::

- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl 
-  <https://github.com/ROCm/verl>`__ repository, which differs from the 
-  `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
-
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`, 
-  which includes ROCm, verl, and all required dependencies.
-
-  - See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` 
-    for installation and setup instructions.
-
-  - You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__ 
-    for additional context.
-
-Version support
--------------------------------------------------------------------------------
-
-verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
-**Officially Supported**: AMD Instinct™ MI300X
+	verl is supported on ROCm 6.2.0.

 .. _verl-recommendations:

 Use cases and recommendations
 ================================================================================

-* The benefits of verl in large-scale reinforcement learning from human feedback 
-  (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD 
-  GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__ 
-  blog. The blog post outlines how the Volcano Engine Reinforcement Learning 
-  (verl) framework integrates with the AMD ROCm platform to optimize training on 
-  Instinct™ MI300X GPUs. The guide details the process of building a Docker image, 
-  setting up single-node and multi-node training environments, and highlights 
-  performance benchmarks demonstrating improved throughput and convergence accuracy. 
-  This resource serves as a comprehensive starting point for deploying verl on AMD GPUs, 
-  facilitating efficient RLHF training workflows.
+The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.

 .. _verl-supported_features:

@@ -95,10 +61,8 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
-with ROCm backends on Docker Hub. The following Docker image tag and associated inventories 
-represent the latest verl version from the official Docker Hub. 
-Click |docker-icon| to view the image on Docker Hub.
+AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
+with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub. 

 .. list-table:: 
    :header-rows: 1
--- a/docs/conceptual/gpu-arch.md
+++ b/docs/conceptual/gpu-arch.md
@@ -13,22 +13,21 @@
 :gutter: 1

 :::{grid-item-card}
-**AMD Instinct MI300 Series**
+**AMD Instinct MI300 series**

-Review hardware aspects of the AMD Instinct™ MI300 Series GPUs and the CDNA™ 3
+Review hardware aspects of the AMD Instinct™ MI300 series of GPU accelerators and the CDNA™ 3
 architecture.

 * [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
 * [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
 * [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
-* [MI300 performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
-* [MI350 Series performance counters](./gpu-arch/mi350-performance-counters.rst)
+* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
 :::

 :::{grid-item-card}
-**AMD Instinct MI200 Series**
+**AMD Instinct MI200 series**

-Review hardware aspects of the AMD Instinct™ MI200 Series GPUs and the CDNA™ 2
+Review hardware aspects of the AMD Instinct™ MI200 series of GPU accelerators and the CDNA™ 2
 architecture.

 * [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
@@ -41,7 +40,7 @@ architecture.
 :::{grid-item-card}
 **AMD Instinct MI100**

-Review hardware aspects of the AMD Instinct™ MI100 Series GPUs and the CDNA™ 1
+Review hardware aspects of the AMD Instinct™ MI100 series of GPU accelerators and the CDNA™ 1
 architecture.

 * [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
--- a/docs/conceptual/gpu-arch/mi100.md
+++ b/docs/conceptual/gpu-arch/mi100.md
@@ -1,14 +1,14 @@
 ---
 myst:
  html_meta:
-    "description lang=en": "Learn about the AMD Instinct MI100 Series architecture."
+    "description lang=en": "Learn about the AMD Instinct MI100 series architecture."
    "keywords": "Instinct, MI100, microarchitecture, AMD, ROCm"
 ---

 # AMD Instinct™ MI100 microarchitecture

 The following image shows the node-level architecture of a system that
-comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ GPUs.
+comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ accelerators.
 The two EPYC processors are connected to each other with the AMD Infinity™
 fabric which provides a high-bandwidth (up to 18 GT/sec) and coherent links such
 that each processor can access the available node memory as a single
@@ -18,29 +18,29 @@ available to connect the processors plus one PCIe Gen 4 x16 link per processor
 can attach additional I/O devices such as the host adapters for the network
 fabric.

-![Structure of a single GCD in the AMD Instinct MI100 GPU](../../data/conceptual/gpu-arch/image004.png "Node-level system architecture with two AMD EPYC™ processors and eight AMD Instinct™ GPUs.")
+![Structure of a single GCD in the AMD Instinct MI100 accelerator](../../data/conceptual/gpu-arch/image004.png "Node-level system architecture with two AMD EPYC™ processors and eight AMD Instinct™ accelerators.")

 In a typical node configuration, each processor can host up to four AMD
-Instinct™ GPUs that are attached using PCIe Gen 4 links at 16 GT/sec,
+Instinct™ accelerators that are attached using PCIe Gen 4 links at 16 GT/sec,
 which corresponds to a peak bidirectional link bandwidth of 32 GB/sec. Each hive
-of four GPUs can participate in a fully connected, coherent AMD
-Instinct™ fabric that connects the four GPUs using 23 GT/sec AMD
+of four accelerators can participate in a fully connected, coherent AMD
+Instinct™ fabric that connects the four accelerators using 23 GT/sec AMD
 Infinity fabric links that run at a higher frequency than the inter-processor
 links. This inter-GPU link can be established in certified server systems if the
 GPUs are mounted in neighboring PCIe slots by installing the AMD Infinity
-Fabric™ bridge for the AMD Instinct™ GPUs.
+Fabric™ bridge for the AMD Instinct™ accelerators.

 ## Microarchitecture

-The microarchitecture of the AMD Instinct GPUs is based on the AMD CDNA
+The microarchitecture of the AMD Instinct accelerators is based on the AMD CDNA
 architecture, which targets compute applications such as high-performance
 computing (HPC) and AI & machine learning (ML) that run on everything from
 individual servers to the world's largest exascale supercomputers. The overall
 system architecture is designed for extreme scalability and compute performance.

-![Structure of the AMD Instinct GPU (MI100 generation)](../../data/conceptual/gpu-arch/image005.png "Structure of the AMD Instinct GPU (MI100 generation)")
+![Structure of the AMD Instinct accelerator (MI100 generation)](../../data/conceptual/gpu-arch/image005.png "Structure of the AMD Instinct accelerator (MI100 generation)")

-The above image shows the AMD Instinct GPU with its PCIe Gen 4 x16
+The above image shows the AMD Instinct accelerator with its PCIe Gen 4 x16
 link (16 GT/sec, at the bottom) that connects the GPU to (one of) the host
 processor(s). It also shows the three AMD Infinity Fabric ports that provide
 high-speed links (23 GT/sec, also at the bottom) to the other GPUs of the local
@@ -48,7 +48,7 @@ hive.

 On the left and right of the floor plan, the High Bandwidth Memory (HBM)
 attaches via the GPU memory controller.  The MI100 generation of the AMD
-Instinct GPU offers four stacks of HBM generation 2 (HBM2) for a total
+Instinct accelerator offers four stacks of HBM generation 2 (HBM2) for a total
 of 32GB with a 4,096bit-wide memory interface. The peak memory bandwidth of the
 attached HBM2 is 1.228 TB/sec at a memory clock frequency of 1.2 GHz.

@@ -64,7 +64,7 @@ Therefore, the theoretical maximum FP64 peak performance is 11.5 TFLOPS
 ![Block diagram of an MI100 compute unit with detailed SIMD view of the AMD CDNA architecture](../../data/conceptual/gpu-arch/image006.png "An MI100 compute unit with detailed SIMD view of the AMD CDNA architecture")

 The preceding image shows the block diagram of a single CU of an AMD Instinct™
-MI100 GPU and summarizes how instructions flow through the execution
+MI100 accelerator and summarizes how instructions flow through the execution
 engines. The CU fetches the instructions via a 32KB instruction cache and moves
 them forward to execution via a dispatcher. The CU can handle up to ten
 wavefronts at a time and feed their instructions into the execution unit. The
--- a/docs/conceptual/gpu-arch/mi250.md
+++ b/docs/conceptual/gpu-arch/mi250.md
@@ -1,13 +1,13 @@
 ---
 myst:
  html_meta:
-    "description lang=en": "Learn about the AMD Instinct MI250 Series architecture."
+    "description lang=en": "Learn about the AMD Instinct MI250 series architecture."
    "keywords": "Instinct, MI250, microarchitecture, AMD, ROCm"
 ---

 # AMD Instinct™ MI250 microarchitecture

-The microarchitecture of the AMD Instinct MI250 GPU is based on the
+The microarchitecture of the AMD Instinct MI250 accelerators is based on the
 AMD CDNA 2 architecture that targets compute applications such as HPC,
 artificial intelligence (AI), and machine learning (ML) and that run on
 everything from individual servers to the world’s largest exascale
@@ -40,7 +40,7 @@ execution units (also called matrix cores), which are geared toward executing
 matrix operations like matrix-matrix multiplications. For FP64, the peak
 performance of these units amounts to 90.5 TFLOPS.

-![Structure of a single GCD in the AMD Instinct MI250 GPU.](../../data/conceptual/gpu-arch/image001.png "Structure of a single GCD in the AMD Instinct MI250 GPU.")
+![Structure of a single GCD in the AMD Instinct MI250 accelerator.](../../data/conceptual/gpu-arch/image001.png "Structure of a single GCD in the AMD Instinct MI250 accelerator.")

 ```{list-table} Peak-performance capabilities of the MI250 OAM for different data types.
 :header-rows: 1
@@ -84,9 +84,16 @@ performance of these units amounts to 90.5 TFLOPS.
  - 362.1
 ```

-The above table summarizes the aggregated peak performance of the AMD Instinct MI250 Open Compute Platform (OCP) Open Accelerator Modules (OAMs) and its two GCDs for different data types and execution units. The middle column lists the peak performance (number of data elements processed in a single instruction) of a single compute unit if a SIMD (or matrix) instruction is being retired in each clock cycle. The third column lists the theoretical peak performance of the OAM module. The theoretical aggregated peak memory bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).
+The above table summarizes the aggregated peak performance of the AMD
+Instinct MI250 OCP Open Accelerator Modules (OAM, OCP is short for Open Compute
+Platform) and its two GCDs for different data types and execution units. The
+middle column lists the peak performance (number of data elements processed in a
+single instruction) of a single compute unit if a SIMD (or matrix) instruction
+is being retired in each clock cycle. The third column lists the theoretical
+peak performance of the OAM module. The theoretical aggregated peak memory
+bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).

-![Dual-GCD architecture of the AMD Instinct MI250 GPUs](../../data/conceptual/gpu-arch/image002.png "Dual-GCD architecture of the AMD Instinct MI250 GPUs")
+![Dual-GCD architecture of the AMD Instinct MI250 accelerators](../../data/conceptual/gpu-arch/image002.png "Dual-GCD architecture of the AMD Instinct MI250 accelerators")

 The following image shows the block diagram of an OAM package that consists
 of two GCDs, each of which constitutes one GPU device in the system. The two
@@ -98,18 +105,18 @@ between the two GCDs of an OAM, or a bidirectional peak transfer bandwidth of
 ## Node-level architecture

 The following image shows the node-level architecture of a system that is
-based on the AMD Instinct MI250 GPU. The MI250 OAMs attach to the host
+based on the AMD Instinct MI250 accelerator. The MI250 OAMs attach to the host
 system via PCIe Gen 4 x16 links (yellow lines). Each GCD maintains its own PCIe
 x16 link to the host part of the system. Depending on the server platform, the
 GCD can attach to the AMD EPYC processor directly or via an optional PCIe switch
 . Note that some platforms may offer an x8 interface to the GCDs, which reduces
 the available host-to-GPU bandwidth.

-![Block diagram of AMD Instinct MI250 GPUs with 3rd Generation AMD EPYC processor](../../data/conceptual/gpu-arch/image003.png "Block diagram of AMD Instinct MI250 GPUs with 3rd Generation AMD EPYC processor")
+![Block diagram of AMD Instinct MI250 Accelerators with 3rd Generation AMD EPYC processor](../../data/conceptual/gpu-arch/image003.png "Block diagram of AMD Instinct MI250 Accelerators with 3rd Generation AMD EPYC processor")

 The preceding image shows the node-level architecture of a system with AMD
 EPYC processors in a dual-socket configuration and four AMD Instinct MI250
-GPUs. The MI250 OAMs attach to the host processors system via PCIe Gen 4
+accelerators. The MI250 OAMs attach to the host processors system via PCIe Gen 4
 x16 links (yellow lines). Depending on the system design, a PCIe switch may
 exist to make more PCIe lanes available for additional components like network
 interfaces and/or storage devices. Each GCD maintains its own PCIe x16 link to
--- a/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
@@ -1,16 +1,16 @@
 .. meta::
-  :description: MI300 and MI200 Series performance counters and metrics
+  :description: MI300 and MI200 series performance counters and metrics
  :keywords: MI300, MI200, performance counters, command processor counters

 ***************************************************************************************************
-MI300 and MI200 Series performance counters and metrics
+MI300 and MI200 series performance counters and metrics
 ***************************************************************************************************

 This document lists and describes the hardware performance counters and derived metrics available
 for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
 :doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.

-MI300 and MI200 Series performance counters
+MI300 and MI200 series performance counters
 ===============================================================

 Series performance counters include the following categories:
@@ -27,7 +27,7 @@ The following sections provide additional details for each category.

 .. note::

-  Preliminary validation of all MI300 and MI200 Series performance counters is in progress. Those with
+  Preliminary validation of all MI300 and MI200 series performance counters is in progress. Those with
  an asterisk (*) require further evaluation.

 .. _command-processor-counters:
@@ -171,7 +171,7 @@ Instruction mix
  "``SQ_INSTS_SMEM``", "Instr", "Number of scalar memory instructions issued"
  "``SQ_INSTS_SMEM_NORM``", "Instr", "Number of scalar memory instructions normalized to match ``smem_level`` issued"
  "``SQ_INSTS_FLAT``", "Instr", "Number of flat instructions issued"
-  "``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 Series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
+  "``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
  "``SQ_INSTS_LDS``", "Instr", "Number of LDS instructions issued **(MI200: includes flat; MI300: does not include flat)**"
  "``SQ_INSTS_GDS``", "Instr", "Number of global data share instructions issued"
  "``SQ_INSTS_EXP_GDS``", "Instr", "Number of EXP and global data share instructions excluding skipped export instructions issued"
@@ -396,9 +396,9 @@ Texture cache per pipe counters
  "``TCP_UTCL1_TRANSLATION_MISS[n]``", "Req", "Number of unified translation cache (L1) translation misses", "0-15"
  "``TCP_UTCL1_PERMISSION_MISS[n]``", "Req", "Number of unified translation cache (L1) permission misses", "0-15"
  "``TCP_TOTAL_CACHE_ACCESSES[n]``", "Req", "Number of vector L1d cache accesses including hits and misses", "0-15"
-  "``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 Series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
-  "``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 Series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
-  "``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 Series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
+  "``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
+  "``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
+  "``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
  "``TCP_TCC_READ_REQ[n]``", "Req", "Number of read requests to L2 cache", "0-15"
  "``TCP_TCC_WRITE_REQ[n]``", "Req", "Number of write requests to L2 cache", "0-15"
  "``TCP_TCC_ATOMIC_WITH_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache with return", "0-15"
@@ -560,7 +560,7 @@ Note the following:
  ``TCC_TAG_STALL[n]``, probes can stall the pipeline at a variety of places. There is no single point that
  can accurately measure the total stalls

-MI300 and MI200 Series derived metrics list
+MI300 and MI200 series derived metrics list
 ==============================================================

 .. csv-table::
--- a/docs/conceptual/gpu-arch/mi300.md
+++ b/docs/conceptual/gpu-arch/mi300.md
@@ -1,21 +1,21 @@
 ---
 myst:
  html_meta:
-    "description lang=en": "Learn about the AMD Instinct MI300 Series architecture."
+    "description lang=en": "Learn about the AMD Instinct MI300 series architecture."
    "keywords": "Instinct, MI300X, MI300A, microarchitecture, AMD, ROCm"
 ---

-# AMD Instinct™ MI300 Series microarchitecture
+# AMD Instinct™ MI300 series microarchitecture

-The AMD Instinct MI300 Series GPUs are based on the AMD CDNA 3
+The AMD Instinct MI300 series accelerators are based on the AMD CDNA 3
 architecture which was designed to deliver leadership performance for HPC, artificial intelligence (AI), and machine
-learning (ML) workloads. The AMD Instinct MI300 Series GPUs are well-suited for extreme scalability and compute performance, running
+learning (ML) workloads. The AMD Instinct MI300 series accelerators are well-suited for extreme scalability and compute performance, running
 on everything from individual servers to the world’s largest exascale supercomputers.

-With the MI300 Series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
+With the MI300 series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
 GPU computational elements of the processor along with the lower levels of the cache hierarchy.

-The following image depicts the structure of a single XCD in the AMD Instinct MI300 GPU Series.
+The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.

 ```{figure} ../../data/shared/xcd-sys-arch.png
 ---
@@ -39,7 +39,7 @@ infrastructure) using the AMD Infinity Fabric™ technology as interconnect.
 The Matrix Cores inside the CDNA 3 CUs have significant improvements, emphasizing AI and machine
 learning, enhancing throughput of existing data types while adding support for new data types.
 CDNA 2 Matrix Cores support FP16 and BF16, while offering INT8 for inference. Compared to MI250X
-GPUs, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
+accelerators, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
 performance gain of 6.8 times for INT8. FP8 has a performance gain of 16 times compared to FP32,
 while TF32 has a gain of 4 times compared to FP32.

@@ -105,7 +105,7 @@ name: mi300-arch
 alt:
 align: center
 ---
-MI300 Series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
+MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
 ```

 ## Node-level architecture
@@ -116,11 +116,11 @@ name: mi300-node

 align: center
 ---
-MI300 Series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIEe switches via retimers and HGX connectors.
+MI300 series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIEe switches via retimers and HGX connectors.
 ```

 The image above shows the node-level architecture of a system with AMD EPYC processors in a
-dual-socket configuration and eight AMD Instinct MI300X GPUs. The MI300X OAMs attach to the
+dual-socket configuration and eight AMD Instinct MI300X accelerators. The MI300X OAMs attach to the
 host system via PCIe Gen 5 x16 links (yellow lines). The GPUs are using seven high-bandwidth,
 low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected 8-GPU system.

--- a/docs/conceptual/gpu-arch/mi350-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi350-performance-counters.rst
@@ -1,530 +0,0 @@
-.. meta::
-  :description: MI355 Series performance counters and metrics
-  :keywords: MI355, MI355X, MI3XX
-
-***********************************
-MI350 Series performance counters
-***********************************
-
-This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 GPUs. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.
-
-The following sections list the performance counters based on the IP blocks.
-
-Command processor packet processor counters (CPC)
-==================================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - CPC_ALWAYS_COUNT
-      - Always count.
-
-    * - CPC_ADC_VALID_CHUNK_NOT_AVAIL
-      - ADC valid chunk is not available when dispatch walking is in progress in the multi-xcc mode.
-
-    * - CPC_ADC_DISPATCH_ALLOC_DONE
-      - ADC dispatch allocation is done.
-
-    * - CPC_ADC_VALID_CHUNK_END
-      - ADC crawler's valid chunk end in the multi-xcc mode.
-
-    * - CPC_SYNC_FIFO_FULL_LEVEL
-      - SYNC FIFO full last cycles.
-
-    * - CPC_SYNC_FIFO_FULL
-      - SYNC FIFO full times.
-
-    * - CPC_GD_BUSY
-      - ADC busy.
-
-    * - CPC_TG_SEND
-      - ADC thread group send.
-
-    * - CPC_WALK_NEXT_CHUNK
-      - ADC walking next valid chunk in the multi-xcc mode.
-
-    * - CPC_STALLED_BY_SE0_SPI
-      - ADC CSDATA stalled by SE0SPI.
-
-    * - CPC_STALLED_BY_SE1_SPI
-      - ADC CSDATA stalled by SE1SPI.
-
-    * - CPC_STALLED_BY_SE2_SPI
-      - ADC CSDATA stalled by SE2SPI.
-
-    * - CPC_STALLED_BY_SE3_SPI
-      - ADC CSDATA stalled by SE3SPI.
-
-    * - CPC_LTE_ALL
-      - CPC sync counter LteAll. Only Master XCD manages LteAll.
-
-    * - CPC_SYNC_WRREQ_FIFO_BUSY
-      - CPC sync counter request FIFO is not empty.
-
-    * - CPC_CANE_BUSY
-      - CPC CANE bus is busy, which indicates the presence of inflight sync counter requests.
-
-    * - CPC_CANE_STALL
-      - CPC sync counter sending is stalled by CANE.
-
-Shader pipe interpolators (SPI) counters
-=========================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - SPI_CS0_WINDOW_VALID
-      - Clock count enabled by PIPE0 perfcounter_start event.
-
-    * - SPI_CS0_BUSY
-      - Number of clocks with outstanding waves for PIPE0 (SPI or SH).
-
-    * - SPI_CS0_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE0.
-
-    * - SPI_CS0_CRAWLER_STALL
-      - Number of clocks when PIPE0 event or wave order FIFO is full.
-
-    * - SPI_CS0_EVENT_WAVE
-      - Number of PIPE0 events and waves.
-
-    * - SPI_CS0_WAVE
-      - Number of PIPE0 waves.
-
-    * - SPI_CS1_WINDOW_VALID
-      - Clock count enabled by PIPE1 perfcounter_start event.
-
-    * - SPI_CS1_BUSY
-      - Number of clocks with outstanding waves for PIPE1 (SPI or SH).
-
-    * - SPI_CS1_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE1.
-
-    * - SPI_CS1_CRAWLER_STALL
-      - Number of clocks when PIPE1 event or wave order FIFO is full.
-
-    * - SPI_CS1_EVENT_WAVE
-      - Number of PIPE1 events and waves.
-
-    * - SPI_CS1_WAVE
-      - Number of PIPE1 waves.
-
-    * - SPI_CS2_WINDOW_VALID
-      - Clock count enabled by PIPE2 perfcounter_start event.
-
-    * - SPI_CS2_BUSY
-      - Number of clocks with outstanding waves for PIPE2 (SPI or SH).
-
-    * - SPI_CS2_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE2.
-
-    * - SPI_CS2_CRAWLER_STALL
-      - Number of clocks when PIPE2 event or wave order FIFO is full.
-
-    * - SPI_CS2_EVENT_WAVE
-      - Number of PIPE2 events and waves.
-
-    * - SPI_CS2_WAVE
-      - Number of PIPE2 waves.
-
-    * - SPI_CS3_WINDOW_VALID
-      - Clock count enabled by PIPE3 perfcounter_start event.
-
-    * - SPI_CS3_BUSY
-      - Number of clocks with outstanding waves for PIPE3 (SPI or SH).
-
-    * - SPI_CS3_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE3.
-
-    * - SPI_CS3_CRAWLER_STALL
-      - Number of clocks when PIPE3 event or wave order FIFO is full.
-
-    * - SPI_CS3_EVENT_WAVE
-      - Number of PIPE3 events and waves.
-
-    * - SPI_CS3_WAVE
-      - Number of PIPE3 waves.
-
-    * - SPI_CSQ_P0_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue0.
-
-    * - SPI_CSQ_P0_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue1.
-
-    * - SPI_CSQ_P0_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue2.
-
-    * - SPI_CSQ_P0_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue3.
-
-    * - SPI_CSQ_P0_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue4.
-
-    * - SPI_CSQ_P0_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue5.
-
-    * - SPI_CSQ_P0_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue6.
-
-    * - SPI_CSQ_P0_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue7.
-
-    * - SPI_CSQ_P1_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue0.
-
-    * - SPI_CSQ_P1_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue1.
-
-    * - SPI_CSQ_P1_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue2.
-
-    * - SPI_CSQ_P1_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue3.
-
-    * - SPI_CSQ_P1_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue4.
-
-    * - SPI_CSQ_P1_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue5.
-
-    * - SPI_CSQ_P1_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue6.
-
-    * - SPI_CSQ_P1_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue7.
-
-    * - SPI_CSQ_P2_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue0.
-
-    * - SPI_CSQ_P2_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue1.
-
-    * - SPI_CSQ_P2_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue2.
-
-    * - SPI_CSQ_P2_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue3.
-
-    * - SPI_CSQ_P2_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue4.
-
-    * - SPI_CSQ_P2_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue5.
-
-    * - SPI_CSQ_P2_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue6.
-
-    * - SPI_CSQ_P2_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue7.
-
-    * - SPI_CSQ_P3_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue0.
-
-    * - SPI_CSQ_P3_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue1.
-
-    * - SPI_CSQ_P3_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue2.
-
-    * - SPI_CSQ_P3_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue3.
-
-    * - SPI_CSQ_P3_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue4.
-
-    * - SPI_CSQ_P3_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue5.
-
-    * - SPI_CSQ_P3_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue6.
-
-    * - SPI_CSQ_P3_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue7.
-
-    * - SPI_CSQ_P0_OCCUPANCY
-      - Sum of occupancy info for all PIPE0 queues.
-
-    * - SPI_CSQ_P1_OCCUPANCY
-      - Sum of occupancy info for all PIPE1 queues.
-
-    * - SPI_CSQ_P2_OCCUPANCY
-      - Sum of occupancy info for all PIPE2 queues.
-
-    * - SPI_CSQ_P3_OCCUPANCY
-      - Sum of occupancy info for all PIPE3 queues.
-
-    * - SPI_VWC0_VDATA_VALID_WR
-      - Number of clocks VGPR bus_0 writes VGPRs.
-
-    * - SPI_VWC1_VDATA_VALID_WR
-      - Number of clocks VGPR bus_1 writes VGPRs.
-
-    * - SPI_CSC_WAVE_CNT_BUSY
-      - Number of cycles when there is any wave in the pipe.
-
-Compute unit (SQ) counters
-===========================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - SQ_INSTS_VALU_MFMA_F6F4
-      - Number of VALU V_MFMA_*_F6F4 instructions.
-
-    * - SQ_INSTS_VALU_MFMA_MOPS_F6F4
-      - Number of VALU matrix with the performed math operations (add or mul) divided by 512, assuming a full EXEC mask of F6 or F4 data type.
-
-    * - SQ_ACTIVE_INST_VALU2
-      - Number of quad-cycles when two VALU instructions are issued (per-simd, nondeterministic).
-
-    * - SQ_INSTS_LDS_LOAD
-      - Number of LDS load instructions issued (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_STORE
-      - Number of LDS store instructions issued (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_ATOMIC
-      - Number of LDS atomic instructions issued (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_LOAD_BANDWIDTH
-      - Total number of 64-bytes loaded (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_STORE_BANDWIDTH
-      - Total number of 64-bytes written (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_ATOMIC_BANDWIDTH
-      - Total number of 64-bytes atomic (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
-
-    * - SQ_INSTS_VALU_FLOPS_FP16
-      - Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP32
-      - Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP64
-      - Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP16_TRANS
-      - Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP32_TRANS
-      - Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP64_TRANS
-      - Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_IOPS
-      - Counts OPS per instruction on integer or unsigned or bit data (per-simd, emulated).
-
-    * - SQ_LDS_DATA_FIFO_FULL
-      - Number of cycles LDS data FIFO is full (nondeterministic, unwindowed).
-
-    * - SQ_LDS_CMD_FIFO_FULL
-      - Number of cycles LDS command FIFO is full (nondeterministic, unwindowed).
-
-    * - SQ_VMEM_TA_ADDR_FIFO_FULL
-      - Number of cycles texture requests are stalled due to full address FIFO in TA (nondeterministic, unwindowed).
-
-    * - SQ_VMEM_TA_CMD_FIFO_FULL
-      - Number of cycles texture requests are stalled due to full cmd FIFO in TA (nondeterministic, unwindowed).
-
-    * - SQ_VMEM_WR_TA_DATA_FIFO_FULL
-      - Number of cycles texture writes are stalled due to full data FIFO in TA (nondeterministic, unwindowed).
-
-    * - SQC_ICACHE_MISSES_DUPLICATE
-      - Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
-
-    * - SQC_DCACHE_MISSES_DUPLICATE
-      - Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
-
-Texture addressing (TA) unit counters
-======================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TA_BUFFER_READ_LDS_WAVEFRONTS
-      - Number of buffer read wavefronts for LDS return processed by the TA.
-
-    * - TA_FLAT_READ_LDS_WAVEFRONTS
-      - Number of flat opcode reads for LDS return processed by the TA.
-
-Texture data (TD) unit counters
-================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TD_WRITE_ACKT_WAVEFRONT
-      - Number of write acknowledgments, sent to SQ and not to SP.
-
-    * - TD_TD_SP_TRAFFIC
-      - Number of times this TD sends data to the SP.
-
-Texture cache per pipe (TCP) counters
-======================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TCP_TCP_TA_ADDR_STALL_CYCLES
-      - TCP stalls TA addr interface.
-
-    * - TCP_TCP_TA_DATA_STALL_CYCLES
-      - TCP stalls TA data interface. Now windowed.
-
-    * - TCP_LFIFO_STALL_CYCLES
-      - Memory latency FIFOs full stall.
-
-    * - TCP_RFIFO_STALL_CYCLES
-      - Memory Request FIFOs full stall.
-
-    * - TCP_TCR_RDRET_STALL
-      - Write into cache stalled by read return from TCR.
-
-    * - TCP_PENDING_STALL_CYCLES
-      - Stall due to data pending from L2.
-
-    * - TCP_UTCL1_SERIALIZATION_STALL
-      - Total number of stalls caused due to serializing translation requests through the UTCL1.
-
-    * - TCP_UTCL1_THRASHING_STALL
-      - Stall caused by thrashing feature in any probe. Lacks accuracy when the stall signal overlaps between probe0 and probe1, which is worse with MECO of thrashing deadlock. Some probe0 events could miss being counted in with MECO on. This perf count provides a rough thrashing estimate.
-
-    * - TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS
-      - Translation miss_under_miss.
-
-    * - TCP_UTCL1_STALL_INFLIGHT_MAX
-      - Total UTCL1 stalls due to inflight counter saturation.
-
-    * - TCP_UTCL1_STALL_LRU_INFLIGHT
-      - Total UTCL1 stalls due to LRU cache line with inflight traffic.
-
-    * - TCP_UTCL1_STALL_MULTI_MISS
-      - Total UTCL1 stalls due to arbitrated multiple misses.
-
-    * - TCP_UTCL1_LFIFO_FULL
-      - Total UTCL1 and UTCL2 latency, which hides FIFO full cycles.
-
-    * - TCP_UTCL1_STALL_LFIFO_NOT_RES
-      - Total UTCL1 stalls due to UTCL2 latency, which hides FIFO output (not resident).
-
-    * - TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS
-      - Total UTCL1 stalls due to UTCL2_req being out of credits.
-
-    * - TCP_CLIENT_UTCL1_INFLIGHT
-      - The sum of inflight client to UTCL1 requests per cycle.
-
-    * - TCP_TAGRAM0_REQ
-      - Total L2 requests mapping to TagRAM 0 from this TCP to all TCCs.
-
-    * - TCP_TAGRAM1_REQ
-      - Total L2 requests mapping to TagRAM 1 from this TCP to all TCCs.
-
-    * - TCP_TAGRAM2_REQ
-      - Total L2 requests mapping to TagRAM 2 from this TCP to all TCCs.
-
-    * - TCP_TAGRAM3_REQ
-      - Total L2 requests mapping to TagRAM 3 from this TCP to all TCCs.
-
-    * - TCP_TCP_LATENCY
-      - Total TCP wave latency (from the first clock of wave entering to the first clock of wave leaving). Divide by TA_TCP_STATE_READ to find average wave latency.
-
-    * - TCP_TCC_READ_REQ_LATENCY
-      - Total TCP to TCC request latency for reads and atomics with return. Not Windowed.
-
-    * - TCP_TCC_WRITE_REQ_LATENCY
-      - Total TCP to TCC request latency for writes and atomics without return. Not Windowed.
-
-    * - TCP_TCC_WRITE_REQ_HOLE_LATENCY
-      - Total TCP req to TCC hole latency for writes and atomics. Not Windowed.
-
-Texture cache per channel (TCC) counters
-=========================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TCC_READ_SECTORS
-      - Total number of 32B data sectors in read requests.
-
-    * - TCC_WRITE_SECTORS
-      - Total number of 32B data sectors in write requests.
-
-    * - TCC_ATOMIC_SECTORS
-      - Total number of 32B data sectors in atomic requests.
-
-    * - TCC_BYPASS_REQ
-      - Number of bypass requests. This is measured at the tag block.
-
-    * - TCC_LATENCY_FIFO_FULL
-      - Number of cycles when the latency FIFO is full.
-
-    * - TCC_SRC_FIFO_FULL
-      - Number of cycles when the SRC FIFO is assumed to be full as measured at the IB block.
-
-    * - TCC_EA0_RDREQ_64B
-      - Number of 64-byte TCC/EA read requests.
-
-    * - TCC_EA0_RDREQ_128B
-      - Number of 128-byte TCC/EA read requests.
-
-    * - TCC_IB_REQ
-      - Number of requests through the IB. This measures the number of raw requests from graphics clients to this TCC.
-
-    * - TCC_IB_STALL
-      - Number of cycles when the IB output is stalled.
-
-    * - TCC_EA0_WRREQ_WRITE_DRAM
-      - Number of TCC/EA write requests (32-byte or 64-byte) destined for DRAM (MC).
-
-    * - TCC_EA0_WRREQ_ATOMIC_DRAM
-      - Number of TCC/EA atomic requests (32-byte or 64-byte) destined for DRAM (MC).
-
-    * - TCC_EA0_RDREQ_DRAM_32B
-      - Number of 32-byte TCC/EA read requests due to DRAM traffic. One 64-byte request is counted as two and one 128-byte as four.
-
-    * - TCC_EA0_RDREQ_GMI_32B
-      - Number of 32-byte TCC/EA read requests due to GMI traffic. One 64-byte request is counted as two and one 128-byte as four.
-
-    * - TCC_EA0_RDREQ_IO_32B
-      - Number of 32-byte TCC/EA read requests due to IO traffic. One 64-byte request is counted as two and one 128-byte as four.
-
-    * - TCC_EA0_WRREQ_WRITE_DRAM_32B
-      - Number of 32-byte TCC/EA write requests due to DRAM traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_ATOMIC_DRAM_32B
-      - Number of 32-byte TCC/EA atomic requests due to DRAM traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_WRITE_GMI_32B
-      - Number of 32-byte TCC/EA write requests due to GMI traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_ATOMIC_GMI_32B
-      - Number of 32-byte TCC/EA atomic requests due to GMI traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_WRITE_IO_32B
-      - Number of 32-byte TCC/EA write requests due to IO traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_ATOMIC_IO_32B
-      - Number of 32-byte TCC/EA atomic requests due to IO traffic. One 64-byte request is counted as two.
--- a/docs/conceptual/gpu-isolation.md
+++ b/docs/conceptual/gpu-isolation.md
@@ -34,7 +34,7 @@ Runtime

 ```{code-block} shell
 :caption: Example to expose the 1. device and a device based on UUID.
-export ROCR_VISIBLE_DEVICES="0,GPU-4b2c1a9f-8d3e-6f7a-b5c9-2e4d8a1f6c3b"
+export ROCR_VISIBLE_DEVICES="0,GPU-DEADBEEFDEADBEEF"
 ```

 ### `GPU_DEVICE_ORDINAL`
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -8,7 +8,6 @@ import os
 import shutil
 import sys
 from pathlib import Path
-from subprocess import run

 gh_release_path = os.path.join("..", "RELEASE.md")
 gh_changelog_path = os.path.join("..", "CHANGELOG.md")
@@ -81,27 +80,24 @@ latex_elements = {
 }

 html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
-html_context = {"docs_header_version": "7.1.1"}
+html_context = {}
 if os.environ.get("READTHEDOCS", "") == "True":
    html_context["READTHEDOCS"] = True

-# Check if the branch is a docs/ branch
-official_branch = run(["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True).stdout.find("docs/")
-
 # configurations for PDF output by Read the Docs
 project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "7.1.1"
-release = "7.1.1"
+version = "6.4.3"
+release = "6.4.3"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""

 # pages with specific settings
 article_pages = [
-    {"file": "about/release-notes", "os": ["linux"], "date": "2025-11-26"},
+    {"file": "about/release-notes", "os": ["linux"], "date": "2025-08-07"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
@@ -114,15 +110,11 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
-    {"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/index", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/multi-node-setup", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/prerequisite-system-validation", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/system-health-check", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
@@ -135,9 +127,7 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
@@ -145,13 +135,9 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
@@ -176,8 +162,6 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

@@ -206,7 +190,7 @@ external_toc_path = "./sphinx/_toc.yml"
 # Add the _extensions directory to Python's search path
 sys.path.append(str(Path(__file__).parent / 'extension'))

-extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "remote-content", "version-ref", "csv-to-list-table"]
+extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "version-ref", "csv-to-list-table"]

 compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')

@@ -216,14 +200,10 @@ external_projects_current_project = "rocm"
 # external_projects_remote_repository = ""

 html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
-html_context = {"docs_header_version": "7.1.0"}
+html_context = {}
 if os.environ.get("READTHEDOCS", "") == "True":
    html_context["READTHEDOCS"] = True

-html_context["official_branch"] = official_branch
-html_context["version"] = version
-html_context["release"] = release
-
 html_theme = "rocm_docs_theme"
 html_theme_options = {"flavor": "rocm-docs-home"}

@@ -242,7 +222,7 @@ suppress_warnings = ["autosectionlabel.*"]

 html_context = {
    "project_path" : {project_path},
-    "gpu_type" : [('AMD Instinct GPUs', 'intrinsic'), ('AMD gfx families', 'gfx'), ('NVIDIA families', 'nvidia') ],
+    "gpu_type" : [('AMD Instinct accelerators', 'intrinsic'), ('AMD gfx families', 'gfx'), ('NVIDIA families', 'nvidia') ],
    "atomics_type" : [('HW atomics', 'hw-atomics'), ('CAS emulation', 'cas-atomics')],
    "pcie_type" : [('No PCIe atomics', 'nopcie'), ('PCIe atomics', 'pcie')],
    "memory_type" : [('Device DRAM', 'device-dram'), ('Migratable Host DRAM', 'migratable-host-dram'), ('Pinned Host DRAM', 'pinned-host-dram')],
--- a/docs/data/about/compatibility/floating-point-data-types.png
+++ b/docs/data/about/compatibility/floating-point-data-types.png
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
@@ -1,188 +0,0 @@
-dockers:
-  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
-    components:
-      ROCm: 6.4.1
-      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
-      PyTorch: 2.7.0+gitf717b2a
-      hipBLASLt: 0.15
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_vllm_llama-3.1-8b
-      model_repo: meta-llama/Llama-3.1-8B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 70B
-      mad_tag: pyt_vllm_llama-3.1-70b
-      model_repo: meta-llama/Llama-3.1-70B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 405B
-      mad_tag: pyt_vllm_llama-3.1-405b
-      model_repo: meta-llama/Llama-3.1-405B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 2 70B
-      mad_tag: pyt_vllm_llama-2-70b
-      model_repo: meta-llama/Llama-2-70b-chat-hf
-      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 4096
-        max_num_batched_tokens: 4096
-        max_model_len: 4096
-    - model: Llama 3.1 8B FP8
-      mad_tag: pyt_vllm_llama-3.1-8b_fp8
-      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 70B FP8
-      mad_tag: pyt_vllm_llama-3.1-70b_fp8
-      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 405B FP8
-      mad_tag: pyt_vllm_llama-3.1-405b_fp8
-      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-  - group: Mistral AI
-    tag: mistral
-    models:
-    - model: Mixtral MoE 8x7B
-      mad_tag: pyt_vllm_mixtral-8x7b
-      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-    - model: Mixtral MoE 8x22B
-      mad_tag: pyt_vllm_mixtral-8x22b
-      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 65536
-        max_num_batched_tokens: 65536
-        max_model_len: 8192
-    - model: Mixtral MoE 8x7B FP8
-      mad_tag: pyt_vllm_mixtral-8x7b_fp8
-      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-    - model: Mixtral MoE 8x22B FP8
-      mad_tag: pyt_vllm_mixtral-8x22b_fp8
-      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 65536
-        max_num_batched_tokens: 65536
-        max_model_len: 8192
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: QwQ-32B
-      mad_tag: pyt_vllm_qwq-32b
-      model_repo: Qwen/QwQ-32B
-      url: https://huggingface.co/Qwen/QwQ-32B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Qwen3 30B A3B
-      mad_tag: pyt_vllm_qwen3-30b-a3b
-      model_repo: Qwen/Qwen3-30B-A3B
-      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-  - group: Microsoft Phi
-    tag: phi
-    models:
-    - model: Phi-4
-      mad_tag: pyt_vllm_phi-4
-      model_repo: microsoft/phi-4
-      url: https://huggingface.co/microsoft/phi-4
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 16384
-        max_num_batched_tokens: 16384
-        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
@@ -1,316 +0,0 @@
-dockers:
-  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
-    components:
-      ROCm: 7.0.0
-      vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
-      PyTorch: 2.9.0a0+git1c57644
-      hipBLASLt: 1.0.0
-    dockerfile:
-      commit: 790d22168820507f3105fef29596549378cfe399
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 4096
-          max_model_len: 4096
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B MXFP4
-        mad_tag: pyt_vllm_llama-3.1-405b_fp4
-        model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B
-        mad_tag: pyt_vllm_llama-3.3-70b
-        model_repo: meta-llama/Llama-3.3-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B FP8
-        mad_tag: pyt_vllm_llama-3.3-70b_fp8
-        model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B MXFP4
-        mad_tag: pyt_vllm_llama-3.3-70b_fp4
-        model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 4 Scout 17Bx16E
-        mad_tag: pyt_vllm_llama-4-scout-17b-16e
-        model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E FP8
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek R1 0528 FP8
-        mad_tag: pyt_vllm_deepseek-r1
-        model_repo: deepseek-ai/DeepSeek-R1-0528
-        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_seqs: 1024
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: OpenAI GPT OSS
-    tag: gpt-oss
-    models:
-      - model: GPT OSS 20B
-        mad_tag: pyt_vllm_gpt-oss-20b
-        model_repo: openai/gpt-oss-20b
-        url: https://huggingface.co/openai/gpt-oss-20b
-        precision: bfloat16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
-      - model: GPT OSS 120B
-        mad_tag: pyt_vllm_gpt-oss-120b
-        model_repo: openai/gpt-oss-120b
-        url: https://huggingface.co/openai/gpt-oss-120b
-        precision: bfloat16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen3 8B
-        mad_tag: pyt_vllm_qwen3-8b
-        model_repo: Qwen/Qwen3-8B
-        url: https://huggingface.co/Qwen/Qwen3-8B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 32B
-        mad_tag: pyt_vllm_qwen3-32b
-        model_repo: Qwen/Qwen3-32b
-        url: https://huggingface.co/Qwen/Qwen3-32B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B
-        mad_tag: pyt_vllm_qwen3-30b-a3b
-        model_repo: Qwen/Qwen3-30B-A3B
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B FP8
-        mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
-        model_repo: Qwen/Qwen3-30B-A3B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B
-        mad_tag: pyt_vllm_qwen3-235b-a22b
-        model_repo: Qwen/Qwen3-235B-A22B
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B FP8
-        mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
-        model_repo: Qwen/Qwen3-235B-A22B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-  - group: Microsoft Phi
-    tag: phi
-    models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 16384
-          max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
@@ -1,32 +0,0 @@
-dockers:
-  - pull_tag: lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
-    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729
-    components:
-      ROCm: 7.0.0
-      SGLang: v0.5.2rc1
-      pytorch-triton-rocm: 3.4.0+rocm7.0.0.gitf9e5bf54
-model_groups:
-  - group: Dense models
-    tag: dense-models
-    models:
-      - model: Llama 3.1 8B Instruct
-        model_repo: Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
-      - model: Llama 3.1 405B FP8 KV
-        model_repo: Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-      - model: Llama 3.3 70B FP8 KV
-        model_repo: amd-Llama-3.3-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
-      - model: Qwen3 32B
-        model_repo: Qwen3-32B
-        url: https://huggingface.co/Qwen/Qwen3-32B
-  - group: Small experts models
-    tag: small-experts-models
-    models:
-      - model: DeepSeek V3
-        model_repo: DeepSeek-V3
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V3
-      - model: Mixtral 8x7B v0.1
-        model_repo: Mixtral-8x7B-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,316 +1,188 @@
 dockers:
-  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
+  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
    components:
-      ROCm: 7.0.0
-      vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
-      PyTorch: 2.9.0a0+git1c57644
-      hipBLASLt: 1.0.0
-    dockerfile:
-      commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
+      ROCm: 6.4.1
+      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+      PyTorch: 2.7.0+gitf717b2a
+      hipBLASLt: 0.15
 model_groups:
  - group: Meta Llama
    tag: llama
    models:
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 4096
-          max_model_len: 4096
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B MXFP4
-        mad_tag: pyt_vllm_llama-3.1-405b_fp4
-        model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B
-        mad_tag: pyt_vllm_llama-3.3-70b
-        model_repo: meta-llama/Llama-3.3-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B FP8
-        mad_tag: pyt_vllm_llama-3.3-70b_fp8
-        model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B MXFP4
-        mad_tag: pyt_vllm_llama-3.3-70b_fp4
-        model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 4 Scout 17Bx16E
-        mad_tag: pyt_vllm_llama-4-scout-17b-16e
-        model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E FP8
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek R1 0528 FP8
-        mad_tag: pyt_vllm_deepseek-r1
-        model_repo: deepseek-ai/DeepSeek-R1-0528
-        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_seqs: 1024
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: OpenAI GPT OSS
-    tag: gpt-oss
-    models:
-      - model: GPT OSS 20B
-        mad_tag: pyt_vllm_gpt-oss-20b
-        model_repo: openai/gpt-oss-20b
-        url: https://huggingface.co/openai/gpt-oss-20b
-        precision: bfloat16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
-      - model: GPT OSS 120B
-        mad_tag: pyt_vllm_gpt-oss-120b
-        model_repo: openai/gpt-oss-120b
-        url: https://huggingface.co/openai/gpt-oss-120b
-        precision: bfloat16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
+    - model: Llama 3.1 8B
+      mad_tag: pyt_vllm_llama-3.1-8b
+      model_repo: meta-llama/Llama-3.1-8B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B
+      mad_tag: pyt_vllm_llama-3.1-70b
+      model_repo: meta-llama/Llama-3.1-70B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B
+      mad_tag: pyt_vllm_llama-3.1-405b
+      model_repo: meta-llama/Llama-3.1-405B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 2 70B
+      mad_tag: pyt_vllm_llama-2-70b
+      model_repo: meta-llama/Llama-2-70b-chat-hf
+      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 4096
+        max_num_batched_tokens: 4096
+        max_model_len: 4096
+    - model: Llama 3.1 8B FP8
+      mad_tag: pyt_vllm_llama-3.1-8b_fp8
+      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B FP8
+      mad_tag: pyt_vllm_llama-3.1-70b_fp8
+      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B FP8
+      mad_tag: pyt_vllm_llama-3.1-405b_fp8
+      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
  - group: Mistral AI
    tag: mistral
    models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
+    - model: Mixtral MoE 8x7B
+      mad_tag: pyt_vllm_mixtral-8x7b
+      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B
+      mad_tag: pyt_vllm_mixtral-8x22b
+      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
+    - model: Mixtral MoE 8x7B FP8
+      mad_tag: pyt_vllm_mixtral-8x7b_fp8
+      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B FP8
+      mad_tag: pyt_vllm_mixtral-8x22b_fp8
+      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
  - group: Qwen
    tag: qwen
    models:
-      - model: Qwen3 8B
-        mad_tag: pyt_vllm_qwen3-8b
-        model_repo: Qwen/Qwen3-8B
-        url: https://huggingface.co/Qwen/Qwen3-8B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 32B
-        mad_tag: pyt_vllm_qwen3-32b
-        model_repo: Qwen/Qwen3-32b
-        url: https://huggingface.co/Qwen/Qwen3-32B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B
-        mad_tag: pyt_vllm_qwen3-30b-a3b
-        model_repo: Qwen/Qwen3-30B-A3B
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B FP8
-        mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
-        model_repo: Qwen/Qwen3-30B-A3B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B
-        mad_tag: pyt_vllm_qwen3-235b-a22b
-        model_repo: Qwen/Qwen3-235B-A22B
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B FP8
-        mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
-        model_repo: Qwen/Qwen3-235B-A22B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
+    - model: QwQ-32B
+      mad_tag: pyt_vllm_qwq-32b
+      model_repo: Qwen/QwQ-32B
+      url: https://huggingface.co/Qwen/QwQ-32B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Qwen3 30B A3B
+      mad_tag: pyt_vllm_qwen3-30b-a3b
+      model_repo: Qwen/Qwen3-30B-A3B
+      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
  - group: Microsoft Phi
    tag: phi
    models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 16384
-          max_model_len: 8192
+    - model: Phi-4
+      mad_tag: pyt_vllm_phi-4
+      model_repo: microsoft/phi-4
+      url: https://huggingface.co/microsoft/phi-4
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 16384
+        max_num_batched_tokens: 16384
+        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -1,16 +1,47 @@
 dockers:
-  - pull_tag: rocm/jax-training:maxtext-v25.9
+  - pull_tag: rocm/jax-training:maxtext-v25.7
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
-      ROCm: 7.0.0
-      JAX: 0.6.2
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+c91bac54
+      ROCm: 6.4.1
+      JAX: 0.5.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.x.x
+  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.6.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.1.0-499ece1c21
 model_groups:
  - group: Meta Llama
    tag: llama
    models:
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3 8B
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
      - model: Llama 2 7B
        mad_tag: jax_maxtext_train_llama-2-7b
        model_repo: Llama-2-7B
@@ -23,29 +54,6 @@ model_groups:
        precision: bf16
        multinode_training_script: llama2_70b_multinode.sh
        doc_options: ["single-node", "multi-node"]
-      - model: Llama 3 8B (multi-node)
-        mad_tag: jax_maxtext_train_llama-3-8b
-        multinode_training_script: llama3_8b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3 70B (multi-node)
-        mad_tag: jax_maxtext_train_llama-3-70b
-        multinode_training_script: llama3_70b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3.1 8B
-        mad_tag: jax_maxtext_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 70B
-        mad_tag: jax_maxtext_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.3 70B
-        mad_tag: jax_maxtext_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        precision: bf16
-        doc_options: ["single-node"]
  - group: DeepSeek
    tag: deepseek
    models:
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,21 +1,15 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/megatron-lm:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: aab4234
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/megatron-lm:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
 model_groups:
  - group: Meta Llama
    tag: llama
@@ -26,6 +20,8 @@ model_groups:
        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
      - model: Llama 3.1 70B
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
      - model: Llama 2 7B
        mad_tag: pyt_megatron_lm_train_llama-2-7b
      - model: Llama 2 70B
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
@@ -1,72 +0,0 @@
-dockers:
-  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
-    components:
-      ROCm: 6.4.1
-      JAX: 0.6.0
-      Python: 3.10.12
-      Transformer Engine: 2.1.0+90d703dd
-      hipBLASLt: 1.1.0-499ece1c21
-  - pull_tag: rocm/jax-training:maxtext-v25.7
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
-    components:
-      ROCm: 6.4.1
-      JAX: 0.5.0
-      Python: 3.10.12
-      Transformer Engine: 2.1.0+90d703dd
-      hipBLASLt: 1.x.x
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: jax_maxtext_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 8B
-        mad_tag: jax_maxtext_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 70B
-        mad_tag: jax_maxtext_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3 8B
-        mad_tag: jax_maxtext_train_llama-3-8b
-        multinode_training_script: llama3_8b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3 70B
-        mad_tag: jax_maxtext_train_llama-3-70b
-        multinode_training_script: llama3_70b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 2 7B
-        mad_tag: jax_maxtext_train_llama-2-7b
-        model_repo: Llama-2-7B
-        precision: bf16
-        multinode_training_script: llama2_7b_multinode.sh
-        doc_options: ["single-node", "multi-node"]
-      - model: Llama 2 70B
-        mad_tag: jax_maxtext_train_llama-2-70b
-        model_repo: Llama-2-70B
-        precision: bf16
-        multinode_training_script: llama2_70b_multinode.sh
-        doc_options: ["single-node", "multi-node"]
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V2-Lite (16B)
-        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
-        model_repo: DeepSeek-V2-lite
-        precision: bf16
-        doc_options: ["single-node"]
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: jax_maxtext_train_mixtral-8x7b
-        model_repo: Mixtral-8x7B
-        precision: bf16
-        doc_options: ["single-node"]
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.7-benchmark-models.yaml
@@ -1,49 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.7_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
-    components:
-      ROCm: 6.4.2
-      Primus: v0.1.0-rc1
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+ba586519
-      hipBLASLt: 37ba1d36
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 3.1 70B (proxy)
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.8-benchmark-models.yaml
@@ -1,48 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.8_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      hipBLASLt: d1b517fc7a
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 3.1 70B (proxy)
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
@@ -1,58 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.7_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
-    components:
-      ROCm: 6.4.2
-      Primus: v0.1.0-rc1
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+ba586519
-      hipBLASLt: 37ba1d36
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
@@ -1,58 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.8_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
-    components:
-      ROCm: 6.4.3
-      Primus: 927a717
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      hipBLASLt: d1b517fc7a
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
@@ -1,24 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.8
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+a1e66aae
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-d1b517fc7a
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: primus_pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-    - model: Llama 3.1 70B
-      mad_tag: primus_pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
-      precision: BF16
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
@@ -1,162 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.7
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
-    components:
-      ROCm: 6.4.2
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+94e53dd8
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-4b9a52edfc
-      Triton: 3.3.0
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 4 Scout 17B-16E
-      mad_tag: pyt_train_llama-4-scout-17b-16e
-      model_repo: Llama-4-17B_16E
-      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.3 70B
-      mad_tag: pyt_train_llama-3.3-70b
-      model_repo: Llama-3.3-70B
-      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 3.2 1B
-      mad_tag: pyt_train_llama-3.2-1b
-      model_repo: Llama-3.2-1B
-      url: https://huggingface.co/meta-llama/Llama-3.2-1B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 3B
-      mad_tag: pyt_train_llama-3.2-3b
-      model_repo: Llama-3.2-3B
-      url: https://huggingface.co/meta-llama/Llama-3.2-3B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 Vision 11B
-      mad_tag: pyt_train_llama-3.2-vision-11b
-      model_repo: Llama-3.2-Vision-11B
-      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.2 Vision 90B
-      mad_tag: pyt_train_llama-3.2-vision-90b
-      model_repo: Llama-3.2-Vision-90B
-      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora]
-    - model: Llama 3.1 405B
-      mad_tag: pyt_train_llama-3.1-405b
-      model_repo: Llama-3.1-405B
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B
-      precision: BF16
-      training_modes: [finetune_qlora]
-    - model: Llama 3 8B
-      mad_tag: pyt_train_llama-3-8b
-      model_repo: Llama-3-8B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3 70B
-      mad_tag: pyt_train_llama-3-70b
-      model_repo: Llama-3-70B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 7B
-      mad_tag: pyt_train_llama-2-7b
-      model_repo: Llama-2-7B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 2 13B
-      mad_tag: pyt_train_llama-2-13b
-      model_repo: Llama-2-13B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 70B
-      mad_tag: pyt_train_llama-2-70b
-      model_repo: Llama-2-70B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-    - model: GPT OSS 20B
-      mad_tag: pyt_train_gpt_oss_20b
-      model_repo: GPT-OSS-20B
-      url: https://huggingface.co/openai/gpt-oss-20b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-    - model: GPT OSS 120B
-      mad_tag: pyt_train_gpt_oss_120b
-      model_repo: GPT-OSS-120B
-      url: https://huggingface.co/openai/gpt-oss-120b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: Qwen 3 8B
-      mad_tag: pyt_train_qwen3-8b
-      model_repo: Qwen3-8B
-      url: https://huggingface.co/Qwen/Qwen3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 3 32B
-      mad_tag: pyt_train_qwen3-32b
-      model_repo: Qwen3-32
-      url: https://huggingface.co/Qwen/Qwen3-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 32B
-      mad_tag: pyt_train_qwen2.5-32b
-      model_repo: Qwen2.5-32B
-      url: https://huggingface.co/Qwen/Qwen2.5-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 72B
-      mad_tag: pyt_train_qwen2.5-72b
-      model_repo: Qwen2.5-72B
-      url: https://huggingface.co/Qwen/Qwen2.5-72B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2 1.5B
-      mad_tag: pyt_train_qwen2-1.5b
-      model_repo: Qwen2-1.5B
-      url: https://huggingface.co/Qwen/Qwen2-1.5B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 2 7B
-      mad_tag: pyt_train_qwen2-7b
-      model_repo: Qwen2-7B
-      url: https://huggingface.co/Qwen/Qwen2-7B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-  - group: Flux
-    tag: flux
-    models:
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
@@ -1,178 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.8
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+a1e66aae
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-d1b517fc7a
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 4 Scout 17B-16E
-      mad_tag: pyt_train_llama-4-scout-17b-16e
-      model_repo: Llama-4-17B_16E
-      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.3 70B
-      mad_tag: pyt_train_llama-3.3-70b
-      model_repo: Llama-3.3-70B
-      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 3.2 1B
-      mad_tag: pyt_train_llama-3.2-1b
-      model_repo: Llama-3.2-1B
-      url: https://huggingface.co/meta-llama/Llama-3.2-1B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 3B
-      mad_tag: pyt_train_llama-3.2-3b
-      model_repo: Llama-3.2-3B
-      url: https://huggingface.co/meta-llama/Llama-3.2-3B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 Vision 11B
-      mad_tag: pyt_train_llama-3.2-vision-11b
-      model_repo: Llama-3.2-Vision-11B
-      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.2 Vision 90B
-      mad_tag: pyt_train_llama-3.2-vision-90b
-      model_repo: Llama-3.2-Vision-90B
-      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora]
-    - model: Llama 3.1 405B
-      mad_tag: pyt_train_llama-3.1-405b
-      model_repo: Llama-3.1-405B
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B
-      precision: BF16
-      training_modes: [finetune_qlora]
-    - model: Llama 3 8B
-      mad_tag: pyt_train_llama-3-8b
-      model_repo: Llama-3-8B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3 70B
-      mad_tag: pyt_train_llama-3-70b
-      model_repo: Llama-3-70B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 7B
-      mad_tag: pyt_train_llama-2-7b
-      model_repo: Llama-2-7B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 2 13B
-      mad_tag: pyt_train_llama-2-13b
-      model_repo: Llama-2-13B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 70B
-      mad_tag: pyt_train_llama-2-70b
-      model_repo: Llama-2-70B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-    - model: GPT OSS 20B
-      mad_tag: pyt_train_gpt_oss_20b
-      model_repo: GPT-OSS-20B
-      url: https://huggingface.co/openai/gpt-oss-20b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-    - model: GPT OSS 120B
-      mad_tag: pyt_train_gpt_oss_120b
-      model_repo: GPT-OSS-120B
-      url: https://huggingface.co/openai/gpt-oss-120b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: Qwen 3 8B
-      mad_tag: pyt_train_qwen3-8b
-      model_repo: Qwen3-8B
-      url: https://huggingface.co/Qwen/Qwen3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 3 32B
-      mad_tag: pyt_train_qwen3-32b
-      model_repo: Qwen3-32
-      url: https://huggingface.co/Qwen/Qwen3-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 32B
-      mad_tag: pyt_train_qwen2.5-32b
-      model_repo: Qwen2.5-32B
-      url: https://huggingface.co/Qwen/Qwen2.5-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 72B
-      mad_tag: pyt_train_qwen2.5-72b
-      model_repo: Qwen2.5-72B
-      url: https://huggingface.co/Qwen/Qwen2.5-72B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2 1.5B
-      mad_tag: pyt_train_qwen2-1.5b
-      model_repo: Qwen2-1.5B
-      url: https://huggingface.co/Qwen/Qwen2-1.5B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 2 7B
-      mad_tag: pyt_train_qwen2-7b
-      model_repo: Qwen2-7B
-      url: https://huggingface.co/Qwen/Qwen2-7B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-    - model: Stable Diffusion XL
-      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-      model_repo: SDXL
-      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-      precision: BF16
-      training_modes: [finetune_lora]
-  - group: Flux
-    tag: flux
-    models:
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: NCF
-    tag: ncf
-    models:
-    - model: NCF
-      mad_tag: pyt_ncf_training
-      model_repo:
-      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
-      precision: FP32
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -1,22 +1,15 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/primus:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: 0.3.0
-      Primus Turbo: 0.1.1
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/primus:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
 model_groups:
  - group: Meta Llama
    tag: llama
--- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -1,39 +0,0 @@
-dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/primus:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: 0.3.0
-      Primus Turbo: 0.1.1
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/primus:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: primus_pyt_train_llama-3.1-8b
-      model_repo: meta-llama/Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      config_file:
-        bf16: "./llama3_8b_fsdp_bf16.toml"
-        fp8: "./llama3_8b_fsdp_fp8.toml"
-    - model: Llama 3.1 70B
-      mad_tag: primus_pyt_train_llama-3.1-70b
-      model_repo: meta-llama/Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
-      precision: BF16
-      config_file:
-        bf16: "./llama3_70b_fsdp_bf16.toml"
-        fp8: "./llama3_70b_fsdp_fp8.toml"
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,21 +1,14 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/pytorch-training:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: aab4234
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/pytorch-training:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
  - group: Meta Llama
    tag: llama
@@ -158,15 +151,6 @@ model_groups:
      url: https://huggingface.co/Qwen/Qwen2-7B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-    - model: Stable Diffusion XL
-      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-      model_repo: SDXL
-      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-      precision: BF16
-      training_modes: [posttrain-p]
  - group: Flux
    tag: flux
    models:
@@ -175,12 +159,4 @@ model_groups:
      model_repo: Flux
      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
      precision: BF16
-      training_modes: [posttrain-p]
-  - group: NCF
-    tag: ncf
-    models:
-    - model: NCF
-      mad_tag: pyt_ncf_training
-      model_repo:
-      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
-      precision: FP32
+      training_modes: [pretrain]
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
--- a/docs/data/reference/precision-support/precision-support.yaml
+++ b/docs/data/reference/precision-support/precision-support.yaml
@@ -1,391 +0,0 @@
-# rocm-library-support.yaml
-library_groups:
-  - group: "ML & Computer Vision"
-    tag: "ml-cv"
-    libraries:
-      - name: "Composable Kernel"
-        tag: "composable-kernel"
-        doc_link: "composable_kernel:reference/Composable_Kernel_supported_scalar_types"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "float4"
-            support: "✅"
-          - type: "float6 (E2M3)"
-            support: "✅"
-          - type: "float6 (E3M2)"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "MIGraphX"
-        tag: "migraphx"
-        doc_link: "amdmigraphx:reference/cpp"
-        data_types:
-          - type: "int8"
-            support: "⚠️"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "MIOpen"
-        tag: "miopen"
-        doc_link: "miopen:reference/datatypes"
-        data_types:
-          - type: "int8"
-            support: "⚠️"
-          - type: "int32"
-            support: "⚠️"
-          - type: "float8 (E4M3)"
-            support: "⚠️"
-          - type: "float8 (E5M2)"
-            support: "⚠️"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "⚠️"
-
-  - group: "Communication"
-    tag: "communication"
-    libraries:
-      - name: "RCCL"
-        tag: "rccl"
-        doc_link: "rccl:api-reference/library-specification"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-  - group: "Math Libraries"
-    tag: "math-libs"
-    libraries:
-      - name: "hipBLAS"
-        tag: "hipblas"
-        doc_link: "hipblas:reference/data-type-support"
-        data_types:
-          - type: "float16"
-            support: "⚠️"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipBLASLt"
-        tag: "hipblaslt"
-        doc_link: "hipblaslt:reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "float4"
-            support: "✅"
-          - type: "float6 (E2M3)"
-            support: "✅"
-          - type: "float6 (E3M2)"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-
-      - name: "hipFFT"
-        tag: "hipfft"
-        doc_link: "hipfft:reference/fft-api-usage"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipRAND"
-        tag: "hiprand"
-        doc_link: "hiprand:api-reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "Output only"
-          - type: "int16"
-            support: "Output only"
-          - type: "int32"
-            support: "Output only"
-          - type: "int64"
-            support: "Output only"
-          - type: "float16"
-            support: "Output only"
-          - type: "float32"
-            support: "Output only"
-          - type: "float64"
-            support: "Output only"
-
-      - name: "hipSOLVER"
-        tag: "hipsolver"
-        doc_link: "hipsolver:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipSPARSE"
-        tag: "hipsparse"
-        doc_link: "hipsparse:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipSPARSELt"
-        tag: "hipsparselt"
-        doc_link: "hipsparselt:reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-
-      - name: "rocBLAS"
-        tag: "rocblas"
-        doc_link: "rocblas:reference/data-type-support"
-        data_types:
-          - type: "float16"
-            support: "⚠️"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocFFT"
-        tag: "rocfft"
-        doc_link: "rocfft:reference/api"
-        data_types:
-          - type: "float16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocRAND"
-        tag: "rocrand"
-        doc_link: "rocrand:api-reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "Output only"
-          - type: "int16"
-            support: "Output only"
-          - type: "int32"
-            support: "Output only"
-          - type: "int64"
-            support: "Output only"
-          - type: "float16"
-            support: "Output only"
-          - type: "float32"
-            support: "Output only"
-          - type: "float64"
-            support: "Output only"
-
-      - name: "rocSOLVER"
-        tag: "rocsolver"
-        doc_link: "rocsolver:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocSPARSE"
-        tag: "rocsparse"
-        doc_link: "rocsparse:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocWMMA"
-        tag: "rocwmma"
-        doc_link: "rocwmma:api-reference/api-reference-guide"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "Output only"
-          - type: "float8 (E4M3)"
-            support: "Input only"
-          - type: "float8 (E5M2)"
-            support: "Input only"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "tensorfloat32"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "Tensile"
-        tag: "tensile"
-        doc_link: "tensile:reference/precision-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "tensorfloat32"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-  - group: "Primitives"
-    tag: "primitives"
-    libraries:
-      - name: "hipCUB"
-        tag: "hipcub"
-        doc_link: "hipcub:api-reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipTensor"
-        tag: "hiptensor"
-        doc_link: "hiptensor:api-reference/api-reference"
-        data_types:
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocPRIM"
-        tag: "rocprim"
-        doc_link: "rocprim:reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocThrust"
-        tag: "rocthrust"
-        doc_link: "rocthrust:data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float16"
-            support: "⚠️"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
--- a/docs/data/rocm-software-stack-7_0_0.jpg
+++ b/docs/data/rocm-software-stack-7_0_0.jpg
--- a/docs/extension/remote-content.py
+++ b/docs/extension/remote-content.py
@@ -1,141 +0,0 @@
-from docutils import nodes
-from docutils.parsers.rst import Directive
-from docutils.statemachine import ViewList
-from sphinx.util import logging
-from sphinx.util.nodes import nested_parse_with_titles
-import requests
-import re
-
-logger = logging.getLogger(__name__)
-
-class BranchAwareRemoteContent(Directive):
-    """
-    Directive that downloads and includes content from other repositories,
-    matching the branch/tag of the current documentation build.
-
-    Usage:
-    .. remote-content::
-       :repo: owner/repository
-       :path: path/to/file.rst
-       :default_branch: docs/develop  # Branch to use when not on a release
-       :tag_prefix: Docs/  # Optional
-    """
-
-    required_arguments = 0
-    optional_arguments = 0
-    final_argument_whitespace = True
-    has_content = False
-    option_spec = {
-        'repo': str,
-        'path': str,
-        'default_branch': str,  # Branch to use when not on a release tag
-        'start_line': int,      # Include the file from a specific line
-        'tag_prefix': str,      # Prefix for release tags (e.g., 'Docs/')
-    }
-
-    def get_current_version(self):
-        """Get current version/branch being built"""
-        env = self.state.document.settings.env
-        html_context = env.config.html_context
-
-        # Check if building from a tag
-        if "official_branch" in html_context:
-            if html_context["official_branch"] == 0:
-                if "version" in html_context:
-                    # Remove any 'v' prefix
-                    version = html_context["version"]
-                    if re.match(r'^\d+\.\d+\.\d+$', version):
-                        return version
-
-        # Not a version tag, so we'll use the default branch
-        return None
-
-    def get_target_ref(self):
-        """Get target reference for the remote repository"""
-        current_version = self.get_current_version()
-
-        # If it's a version number, use tag prefix and version
-        if current_version:
-            tag_prefix = self.options.get('tag_prefix', '')
-            return f'{tag_prefix}{current_version}'
-
-        # For any other case, use the specified default branch
-        if 'default_branch' not in self.options:
-            logger.warning('No default_branch specified and not building from a version tag')
-            return None
-
-        return self.options['default_branch']
-
-    def construct_raw_url(self, repo, path, ref):
-        """Construct the raw.githubusercontent.com URL"""
-        return f'https://raw.githubusercontent.com/{repo}/{ref}/{path}'
-
-    def fetch_and_parse_content(self, url, source_path):
-        """Fetch content and parse it as RST"""
-        response = requests.get(url)
-        response.raise_for_status()
-        content = response.text
-
-        start_line = self.options.get('start_line', 0)
-
-        # Create ViewList for parsing
-        line_count = 0
-        content_list = ViewList()
-        for line_no, line in enumerate(content.splitlines()):
-            if line_count >= start_line:
-                content_list.append(line, source_path, line_no)
-            line_count+=1 
-
-        # Create a section node and parse content
-        node = nodes.section()
-        nested_parse_with_titles(self.state, content_list, node)
-
-        return node.children
-
-    def run(self):
-        if 'repo' not in self.options or 'path' not in self.options:
-            logger.warning('Both repo and path options are required')
-            return []
-
-        target_ref = self.get_target_ref()
-        if not target_ref:
-            return []
-
-        raw_url = self.construct_raw_url(
-            self.options['repo'],
-            self.options['path'],
-            target_ref
-        )
-
-        try:
-            logger.info(f'Attempting to fetch content from {raw_url}')
-            return self.fetch_and_parse_content(raw_url, self.options['path'])
-        except requests.exceptions.RequestException as e:
-            logger.warning(f'Failed to fetch content from {raw_url}: {str(e)}')
-
-            # If we failed on a tag, try falling back to default_branch
-            if re.match(r'^\d+\.\d+\.\d+$', target_ref) or target_ref.startswith('Docs/'):
-                if 'default_branch' in self.options:
-                    try:
-                        fallback_ref = self.options['default_branch']
-                        logger.info(f'Attempting fallback to {fallback_ref}...')
-
-                        fallback_url = self.construct_raw_url(
-                            self.options['repo'],
-                            self.options['path'],
-                            fallback_ref
-                        )
-
-                        return self.fetch_and_parse_content(fallback_url, self.options['path'])
-                    except requests.exceptions.RequestException as e2:
-                        logger.warning(f'Fallback also failed: {str(e2)}')
-
-            return []
-
-def setup(app):
-    app.add_directive('remote-content', BranchAwareRemoteContent)
-
-    return {
-        'parallel_read_safe': True,
-        'parallel_write_safe': True,
-    }
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -10,7 +10,7 @@ Deep learning frameworks provide environments for machine learning, training, fi

 ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.

-The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs.
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.

 The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.

@@ -84,8 +84,6 @@ The table below summarizes information about ROCm-enabled deep learning framewor
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-wheels-package>`__
-
      - .. raw:: html

          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 
@@ -130,22 +128,10 @@ The table below summarizes information about ROCm-enabled deep learning framewor
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#build-your-own-docker-image>`__
      - .. raw:: html

          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

-    * - `FlashInfer <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/flashinfer-compatibility.html>`__
-      - .. raw:: html
-
-          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html"><i class="fas fa-link fa-lg"></i></a>
-      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#use-a-prebuilt-docker-image-with-flashinfer-pre-installed>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#build-your-own-docker-image>`__
-      - .. raw:: html
-
-          <a href="https://github.com/ROCm/flashinfer"><i class="fab fa-github fa-lg"></i></a>
-
 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

--- a/docs/how-to/gpu-performance/mi300x.rst
+++ b/docs/how-to/gpu-performance/mi300x.rst
@@ -1,5 +1,5 @@
 .. meta::
-   :description: How to configure MI300X GPUs to fully leverage their capabilities and achieve optimal performance.
+   :description: How to configure MI300X accelerators to fully leverage their capabilities and achieve optimal performance.
   :keywords: ROCm, AI, machine learning, MI300X, LLM, usage, tutorial, optimization, tuning

 **************************************
@@ -7,11 +7,11 @@ AMD Instinct MI300X performance guides
 **************************************

 The following performance guides provide essential guidance on the necessary
-steps to properly `configure your system for AMD Instinct™ MI300X GPUs
+steps to properly `configure your system for AMD Instinct™ MI300X accelerators
 <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 They include detailed instructions on system settings and application
 :doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
-help you leverage the maximum capabilities of these GPUs and achieve
+help you leverage the maximum capabilities of these accelerators and achieve
 superior performance.

 * `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
@@ -19,9 +19,9 @@ superior performance.
  your AMD Instinct MI300X system for performance.

 * :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
-  optimize the performance of AMD Instinct MI300X Series GPUs for HPC
+  optimize the performance of AMD Instinct MI300X series accelerators for HPC
  and deep learning operations.

 * :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
  environment for LLM inference, designed to help you test performance with
-  popular models on AMD Instinct MI300X Series GPUs.
+  popular models on AMD Instinct MI300X series accelerators.
--- a/docs/how-to/programming_guide.rst
+++ b/docs/how-to/programming_guide.rst
@@ -25,7 +25,7 @@ execute on AMD GPUs while maintaining compatibility with CUDA-based systems.
 OpenCL (Open Computing Language) is an open standard for cross-platform,
 parallel programming of diverse processors. ROCm supports OpenCL for developers
 who want to use standard frameworks across different hardware platforms,
-including CPUs, GPUs, and APUs. For more information, see
+including CPUs, GPUs, and other accelerators. For more information, see
 `OpenCL <https://www.khronos.org/opencl/>`_.

 Python bindings can be found at https://github.com/ROCm/hip-python.
--- a/docs/how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
@@ -11,10 +11,10 @@ Fine-tuning using ROCm involves leveraging AMD's GPU-accelerated :doc:`libraries
 ecosystem for deep learning development, including open-source libraries for optimized deep learning operations and
 ROCm-aware versions of :doc:`deep learning frameworks <../../deep-learning-rocm>` such as PyTorch, TensorFlow, and JAX.

-Single-accelerator systems, such as a machine equipped with a single GPU, are commonly used for
+Single-accelerator systems, such as a machine equipped with a single accelerator or GPU, are commonly used for
 smaller-scale deep learning tasks, including fine-tuning pre-trained models and running inference on moderately
 sized datasets. See :doc:`single-gpu-fine-tuning-and-inference`.

-Multi-accelerator systems, on the other hand, consist of multiple GPUs working in parallel. These systems are
+Multi-accelerator systems, on the other hand, consist of multiple accelerators working in parallel. These systems are
 typically used in LLMs and other large-scale deep learning tasks where performance, scalability, and the handling of
 massive datasets are crucial. See :doc:`multi-gpu-fine-tuning-and-inference`.
--- a/docs/how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
@@ -3,11 +3,11 @@
   :keywords: ROCm, LLM, fine-tuning, usage, tutorial, multi-GPU, distributed, inference, accelerators, PyTorch, HuggingFace, torchtune

 *****************************************************
-Fine-tuning and inference using multiple GPUs
+Fine-tuning and inference using multiple accelerators
 *****************************************************

 This section explains how to fine-tune a model on a multi-accelerator system. See
-:doc:`Single-accelerator fine-tuning <single-gpu-fine-tuning-and-inference>` for a single GPU setup.
+:doc:`Single-accelerator fine-tuning <single-gpu-fine-tuning-and-inference>` for a single accelerator or GPU setup.

 .. _fine-tuning-llms-multi-gpu-env:

@@ -20,7 +20,7 @@ This section was tested using the following hardware and software environment.
   :stub-columns: 1

   * - Hardware
-     - 4 AMD Instinct MI300X GPUs
+     - 4 AMD Instinct MI300X accelerators

   * - Software
     - ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
@@ -40,13 +40,13 @@ Setting up the base implementation environment
   :doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For consistent
   installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.

-#. In the Docker container, check the availability of ROCm-capable GPUs using the following command.
+#. In the Docker container, check the availability of ROCM-capable accelerators using the following command.

   .. code-block:: shell

      rocm-smi --showproductname

-#. Check that your GPUs are available to PyTorch.
+#. Check that your accelerators are available to PyTorch.

   .. code-block:: python

@@ -66,7 +66,7 @@ Setting up the base implementation environment
 .. tip::

   During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
-   This tool helps you see shows which GPUs are involved.
+   This tool helps you see shows which accelerators or GPUs are involved.


 .. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
@@ -74,9 +74,9 @@ Setting up the base implementation environment
 Hugging Face Accelerate for fine-tuning and inference
 ===========================================================

-`Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`__ is a library that simplifies turning raw
-PyTorch code for a single GPU into code for multiple GPUs for LLM fine-tuning and inference. It is
-integrated with `Transformers <https://huggingface.co/docs/transformers/en/index>`__, so you can scale your PyTorch
+`Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_ is a library that simplifies turning raw
+PyTorch code for a single accelerator into code for multiple accelerators for LLM fine-tuning and inference. It is
+integrated with `Transformers <https://huggingface.co/docs/transformers/en/index>`_ allowing you to scale your PyTorch
 code while maintaining performance and flexibility.

 As a brief example of model fine-tuning and inference using multiple GPUs, let's use Transformers and load in the Llama
@@ -107,7 +107,7 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par
   (``"auto"``, ``"balanced"``, ``"balanced_low_0"``, ``"sequential"``).

   It's recommended to set the ``device_map`` parameter to ``“auto”`` to allow Accelerate to automatically and
-   efficiently allocate the model given the available resources (four GPUs in this case).
+   efficiently allocate the model given the available resources (4 accelerators in this case).

   When you have more GPU memory available than the model size, here is the difference between each ``device_map``
   option:
@@ -130,8 +130,8 @@ After loading the model in this way, the model is fully ready to use the resourc
 torchtune for fine-tuning and inference
 =============================================

-`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU 
-model fine-tuning and inference with LLMs.
+`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-accelerator or
+GPU model fine-tuning and inference with LLMs.

 #. Install torchtune using pip.

--- a/docs/how-to/rocm-for-ai/fine-tuning/overview.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/overview.rst
@@ -30,7 +30,7 @@ The challenge of fine-tuning models

 However, the computational cost of fine-tuning is still high, especially for complex models and large datasets, which
 poses distinct challenges related to substantial computational and memory requirements. This might be a barrier for
-GPUs with low computing power or limited device memory resources.
+accelerators or GPUs with low computing power or limited device memory resources.

 For example, suppose we have a language model with 7 billion (7B) parameters, represented by a weight matrix :math:`W`.
 During backpropagation, the model needs to learn a :math:`ΔW` matrix, which updates the original weights to minimize the
@@ -84,8 +84,8 @@ Walkthrough
 ===========

 To demonstrate the benefits of LoRA and the ideal compute compatibility of using PEFT and TRL libraries on AMD
-ROCm-compatible GPUs, let's step through a comprehensive implementation of the fine-tuning process
-using the Llama 2 7B model with LoRA tailored specifically for question-and-answer tasks on AMD MI300X GPUs.
+ROCm-compatible accelerators and GPUs, let's step through a comprehensive implementation of the fine-tuning process
+using the Llama 2 7B model with LoRA tailored specifically for question-and-answer tasks on AMD MI300X accelerators.

 Before starting, review and understand the key components of this walkthrough:

--- a/docs/how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
@@ -3,11 +3,12 @@
   :keywords: ROCm, LLM, fine-tuning, usage, tutorial, single-GPU, LoRA, PEFT, inference, SFTTrainer

 ****************************************************
-Fine-tuning and inference using a single GPU
+Fine-tuning and inference using a single accelerator
 ****************************************************

 This section explains model fine-tuning and inference techniques on a single-accelerator system. See
-:doc:`Multi-accelerator fine-tuning <multi-gpu-fine-tuning-and-inference>` for a setup with multiple GPUs.
+:doc:`Multi-accelerator fine-tuning <multi-gpu-fine-tuning-and-inference>` for a setup with multiple accelerators or
+GPUs.

 .. _fine-tuning-llms-single-gpu-env:

@@ -20,7 +21,7 @@ This section was tested using the following hardware and software environment.
   :stub-columns: 1

   * - Hardware
-     - AMD Instinct MI300X GPU
+     - AMD Instinct MI300X accelerator

   * - Software
     - ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
@@ -40,7 +41,7 @@ Setting up the base implementation environment
   :doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For a consistent
   installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.

-#. In the Docker container, check the availability of ROCm-capable GPUs using the following command.
+#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.

   .. code-block:: shell

@@ -52,14 +53,14 @@ Setting up the base implementation environment

      ============================ ROCm System Management Interface ============================
      ====================================== Product Info ======================================
-      GPU[0]          : Card Series:          AMD Instinct MI300X OAM
+      GPU[0]          : Card series:          AMD Instinct MI300X OAM
      GPU[0]          : Card model:           0x74a1
      GPU[0]          : Card vendor:          Advanced Micro Devices, Inc. [AMD/ATI]
      GPU[0]          : Card SKU:             MI3SRIOV
      ==========================================================================================
      ================================== End of ROCm SMI Log ===================================

-#. Check that your GPUs are available to PyTorch.
+#. Check that your accelerators are available to PyTorch.

   .. code-block:: python

@@ -501,9 +502,9 @@ Let's look at achieving model inference using these types of models.
         # Token generation
         print(pipe("What is a large language model?")[0]["generated_text"])

-If using multiple GPUs, see
+If using multiple accelerators, see
 :ref:`Multi-accelerator fine-tuning and inference <fine-tuning-llms-multi-gpu-hugging-face-accelerate>` to explore
-popular libraries that simplify fine-tuning and inference in a multiple-GPU system.
+popular libraries that simplify fine-tuning and inference in a multi-accelerator system.

 Read more about inference frameworks like vLLM and Hugging Face TGI in
 :doc:`LLM inference frameworks <../inference/llm-inference-frameworks>`.
--- a/docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
@@ -45,7 +45,7 @@ ROCm provides two different implementations of Flash Attention 2 modules. They c
         # Install from source
         git clone https://github.com/ROCm/flash-attention.git
         cd flash-attention/
-         GPU_ARCHS=gfx942 python setup.py install #MI300 Series
+         GPU_ARCHS=gfx942 python setup.py install #MI300 series

      Hugging Face Transformers can easily deploy the CK Flash Attention 2 module by passing an argument
      ``attn_implementation="flash_attention_2"`` in the ``from_pretrained`` class.
@@ -526,7 +526,7 @@ follow these instructions:
   python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning split_table_batched_embeddings_test.py

 To run the FBGEMM_GPU ``uvm`` test, use these commands. These tests only support the AMD MI210 and 
-more recent GPUs. 
+more recent accelerators. 

 .. code-block:: shell

--- a/docs/how-to/rocm-for-ai/inference-optimization/model-quantization.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/model-quantization.rst
@@ -7,7 +7,7 @@ Model quantization techniques
 *****************************

 Quantization reduces the model size compared to its native full-precision version, making it easier to fit large models
-onto GPUs with limited memory usage. This section explains how to perform LLM quantization using AMD Quark, GPTQ
+onto accelerators or GPUs with limited memory usage. This section explains how to perform LLM quantization using AMD Quark, GPTQ
 and bitsandbytes on AMD Instinct hardware.

 .. _quantize-llms-quark:
@@ -311,7 +311,7 @@ ExLlama-v2 support
 ExLlama is a Python/C++/CUDA implementation of the Llama model that is
 designed for faster inference with 4-bit GPTQ weights. The ExLlama
 kernel is activated by default when users create a ``GPTQConfig`` object. To
-boost inference speed even further on Instinct GPUs, use the ExLlama-v2
+boost inference speed even further on Instinct accelerators, use the ExLlama-v2
 kernels by configuring the ``exllama_config`` parameter as the following.

 .. code-block:: python
@@ -332,7 +332,7 @@ The `ROCm-aware bitsandbytes <https://github.com/ROCm/bitsandbytes>`_ library is
 a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizer, matrix multiplication, and
 8-bit and 4-bit quantization functions. The library includes quantization primitives for 8-bit and 4-bit operations
 through ``bitsandbytes.nn.Linear8bitLt`` and ``bitsandbytes.nn.Linear4bit`` and 8-bit optimizers through the
-``bitsandbytes.optim`` module. These modules are supported on AMD Instinct GPUs.
+``bitsandbytes.optim`` module. These modules are supported on AMD Instinct accelerators.

 Installing bitsandbytes
 -----------------------
--- a/docs/how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
+++ b/docs/how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
@@ -9,13 +9,13 @@ myst:

 The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.

-This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X GPUs using CK.
+This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.

 ## High-level overview: a CK GEMM instance

 GEMM is a fundamental block in linear algebra, machine learning, and deep neural networks. It is defined as the operation:
 {math}`E = α \times (A \times B) + β \times (D)`, with A and B as matrix inputs, α and β as scalar inputs, and D as a pre-existing matrix.
-Take the commonly used linear transformation in a fully connected layer as an example. These terms correspond to input activation (A), weight (B), bias (D), and output (E), respectively. The example employs a `DeviceGemmMultipleD_Xdl_CShuffle` struct from CK library as the fundamental instance to explore the compute capability of AMD Instinct GPUs for the computation of GEMM. The implementation of the instance contains two phases:
+Take the commonly used linear transformation in a fully connected layer as an example. These terms correspond to input activation (A), weight (B), bias (D), and output (E), respectively. The example employs a `DeviceGemmMultipleD_Xdl_CShuffle` struct from CK library as the fundamental instance to explore the compute capability of AMD Instinct accelerators for the computation of GEMM. The implementation of the instance contains two phases:

 - [Template parameter definition](#template-parameter-definition)
 - [Instantiating and running the templated kernel](#instantiating-and-running-the-templated-kernel)
@@ -108,7 +108,7 @@ These parameters include Block Size, M/N/K Per Block, M/N per XDL, AK1, BK1, etc

 - Block Size determines the number of threads in the thread block.
 - M/N/K Per Block determines the size of tile that each thread block is responsible for calculating.
- M/N Per XDL refers to M/N size for Instinct GPU Matrix Fused Multiply Add (MFMA) instructions operating on a per-wavefront basis.
+- M/N Per XDL refers to M/N size for Instinct accelerator Matrix Fused Multiply Add (MFMA) instructions operating on a per-wavefront basis.
 - A/B K1 is related to the data type. It can be any value ranging from 1 to K Per Block. To achieve the optimal load/store performance, 128bit per load is suggested. In addition, the A/B loading parameters must be changed accordingly to match the A/B K1 value; otherwise, it will result in compilation errors.

 Conditions for achieving computational load balancing on different hardware platforms can vary.
@@ -133,7 +133,7 @@ Templated kernel launching consists of kernel instantiation, making arguments by

 ## Developing fused INT8 kernels for SmoothQuant models

-[SmoothQuant](https://github.com/mit-han-lab/smoothquant) (SQ) is a quantization algorithm that enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLM. The required GPU kernel functionalities used to accelerate the inference of SQ models on Instinct GPUs are shown in the following table.
+[SmoothQuant](https://github.com/mit-han-lab/smoothquant) (SQ) is a quantization algorithm that enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLM. The required GPU kernel functionalities used to accelerate the inference of SQ models on Instinct accelerators are shown in the following table.

 :::{table} Functionalities used to implement SmoothQuant model inference.

@@ -164,7 +164,7 @@ The CK library contains many fundamental instances that implement different func

 Second, consider whether the format of input data meets your actual calculation needs. For SQ models, the 8-bit integer data format (INT8) is applied for matrix calculations.

-Third, consider the platform for implementing CK instances. The instances suffixed with `xdl` only run on AMD Instinct GPUs after being compiled and cannot run on Radeon-Series GPUs. This is due to the underlying device-specific instruction sets for implementing these basic instances.
+Third, consider the platform for implementing CK instances. The instances suffixed with `xdl` only run on AMD Instinct accelerators after being compiled and cannot run on Radeon-series GPUs. This is due to the underlying device-specific instruction sets for implementing these basic instances.

 Here, we use [DeviceBatchedGemmMultiD_Xdl](https://github.com/ROCm/composable_kernel/tree/develop/example/24_batched_gemm) as the fundamental instance to implement the functionalities in the previous table.

@@ -435,7 +435,7 @@ The implementation architecture of running SmoothQuant models on MI300X GPUs is
 ### Figure 7
 ================ -->
 ```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
-The implementation architecture of running SmoothQuant models on AMD MI300X GPUs.
+The implementation architecture of running SmoothQuant models on AMD MI300X accelerators.
 ```

 For the target [SQ quantized model](https://huggingface.co/mit-han-lab/opt-13b-smoothquant), each decoder layer contains three major components: attention calculation, layer normalization, and linear transformation in fully connected layers.  The corresponding implementation classes for these components are:
@@ -447,21 +447,21 @@ For the target [SQ quantized model](https://huggingface.co/mit-han-lab/opt-13b-s
 These classes' underlying implementation logits will harness the functions in previous table. Note that for the example, the `LayerNormQ` module is implemented by the torch native module.

 Testing environment:
-The hardware platform used for testing equips with 256 AMD EPYC 9534 64-Core Processor, 8 AMD Instinct MI300X GPUs and 1.5T memory. The testing was done in a publicly available Docker image from Docker Hub:
+The hardware platform used for testing equips with 256 AMD EPYC 9534 64-Core Processor, 8 AMD Instinct MI300X accelerators and 1.5T memory. The testing was done in a publicly available Docker image from Docker Hub:
 [`rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2`](https://hub.docker.com/layers/rocm/pytorch/rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2/images/sha256-f6ea7cee8aae299c7f6368187df7beed29928850c3929c81e6f24b34271d652b)

 The tested models are OPT-1.3B, 2.7B, 6.7B and 13B FP16 models and the corresponding SmoothQuant INT8 OPT models were obtained from Hugging Face.

 Note that since the default values were used for the tunable parameters of the fundamental instance, the performance of the INT8 kernel is suboptimal.

-Figure 8 shows the performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X GPU. The GPU memory footprints of SmoothQuant-quantized models are significantly reduced. It also indicates the per-sample inference latency is significantly reduced for all SmoothQuant-quantized OPT models (illustrated in (b)). Notably, the performance of the CK instance-based INT8 kernel steadily improves with an increase in model size.
+Figure 8 shows the performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator. The GPU memory footprints of SmoothQuant-quantized models are significantly reduced. It also indicates the per-sample inference latency is significantly reduced for all SmoothQuant-quantized OPT models (illustrated in (b)). Notably, the performance of the CK instance-based INT8 kernel steadily improves with an increase in model size.

 <!-- 
 ================
 ### Figure 8
 ================ -->
 ```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
-Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X GPU.
+Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator.
 ```

 For accuracy comparisons between the original FP16 and INT8 models, the evaluation is done by using the first 1,000 samples from the LAMBADA dataset's validation set. We employ the same Last Token Prediction Accuracy method introduced in [SmoothQuant Real-INT8 Inference for PyTorch](https://github.com/mit-han-lab/smoothquant/blob/main/examples/smoothquant_opt_real_int8_demo.ipynb) as our evaluation metric. The comparison results are shown in Table 2.
@@ -482,4 +482,4 @@ CK provides a rich set of template parameters for generating flexible accelerate

 CK supports multiple instruction sets of AMD Instinct GPUs, operator fusion and different data precisions. Its composability helps users quickly construct operator performance verification.

-With CK, you can build more effective AI applications with higher flexibility and better performance on different AMD GPU platforms.
+With CK, you can build more effective AI applications with higher flexibility and better performance on different AMD accelerator platforms.
--- a/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -1,30 +1,31 @@
 .. meta::
-   :description: Learn about workload tuning on AMD Instinct MI300X GPUs for optimal performance.
+   :description: Learn about workload tuning on AMD Instinct MI300X accelerators for optimal performance.
   :keywords: AMD, Instinct, MI300X, HPC, tuning, BIOS settings, NBIO, ROCm,
              environment variable, performance, HIP, Triton, PyTorch TunableOp, vLLM, RCCL,
-              MIOpen, GPU, resource utilization
+              MIOpen, accelerator, GPU, resource utilization

 *****************************************
 AMD Instinct MI300X workload optimization
 *****************************************

 This document provides guidelines for optimizing the performance of AMD
-Instinct™ MI300X GPUs, with a particular focus on GPU kernel
+Instinct™ MI300X accelerators, with a particular focus on GPU kernel
 programming, high-performance computing (HPC), and deep learning operations
 using PyTorch. It delves into specific workloads such as
 :ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
 enhance efficiency.

-The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
-well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
-for meticulous tuning.
+The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
+that streamline optimization as well as advanced techniques like
+:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
+meticulous tuning.

 Workload tuning strategy
 ========================

 By following a structured approach, you can systematically address
 performance issues and enhance the efficiency of your workloads on AMD Instinct
-MI300X GPUs.
+MI300X accelerators.

 Measure the current workload
 ----------------------------
@@ -85,28 +86,27 @@ Optimize model inference with vLLM
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 vLLM provides tools and techniques specifically designed for efficient model
-inference on AMD Instinct GPUs. See the official `vLLM installation docs
-<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
-installation guidance. Optimizing performance with vLLM involves configuring
-tensor parallelism, leveraging advanced features, and ensuring efficient
-execution.
+inference on AMD Instinct MI300X accelerators. See :ref:`fine-tuning-llms-vllm`
+for installation guidance. Optimizing performance with vLLM
+involves configuring tensor parallelism, leveraging advanced features, and
+ensuring efficient execution. Here’s how to optimize vLLM performance:

-* Configuration for vLLM: Set engine arguments according to workload
-  requirements.
+* Tensor parallelism: Configure the
+  :ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
+  tensor computations across multiple GPUs. Adjust parameters such as
+  ``batch-size``, ``input-len``, and ``output-len`` based on your workload.
+
+* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
+  according to workload requirements. Benchmark performance to understand
+  characteristics and identify bottlenecks.

 * Benchmarking and performance metrics: Measure latency and throughput to
  evaluate performance.

-.. seealso::
-
-   See :doc:`vllm-optimization` to learn more about vLLM performance
-   optimization techniques.
-
 .. _mi300x-auto-tune:

 Auto-tunable configurations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
 Auto-tunable configurations can significantly streamline performance
 optimization by automatically adjusting parameters based on workload
 characteristics. For example:
@@ -120,7 +120,8 @@ characteristics. For example:
  your specific hardware.

 * Triton: Use :ref:`Triton’s auto-tuning features <mi300x-autotunable-kernel-config>`
-  to explore various kernel configurations and select the best-performing ones.
+  to explore various kernel configurations and automatically select the
+  best-performing ones.

 Manual tuning
 ^^^^^^^^^^^^^
@@ -238,7 +239,7 @@ benchmarking process.

 With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
 using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
-working with AMD Instinct GPUs have multiple tools depending on their specific profiling needs; these include:
+working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these include:

 * :ref:`ROCProfiler <mi300x-rocprof>`

@@ -256,11 +257,11 @@ metrics, commonly called *performance counters*. These counters quantify the per
 showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.

 Your ROCm installation contains a script or executable command called ``rocprof`` which provides the ability to list all
-available hardware counters for your specific GPU, and run applications while collecting counters during
+available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
 their execution.

 This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
-ability to collect timeline traces of the GPU software stack as well as user-annotated code regions.
+ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.

 .. note::

@@ -275,16 +276,16 @@ ROCm Compute Profiler
 ^^^^^^^^^^^^^^^^^^^^^

 :doc:`ROCm Compute Profiler <rocprofiler-compute:index>` is a system performance profiler for high-performance computing (HPC) and
-machine learning (ML) workloads using Instinct GPUs. Under the hood, ROCm Compute Profiler uses
+machine learning (ML) workloads using Instinct accelerators. Under the hood, ROCm Compute Profiler uses
 :ref:`ROCProfiler <mi300x-rocprof>` to collect hardware performance counters. The ROCm Compute Profiler tool performs
 system profiling based on all approved hardware counters for Instinct
-GPU architectures. It provides high level performance analysis features including System Speed-of-Light, IP
+accelerator architectures. It provides high level performance analysis features including System Speed-of-Light, IP
 block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.

 ROCm Compute Profiler takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
 to collect and analyze raw CSV output files as is the case with ROCProfiler. Instead, ROCm Compute Profiler automates the collection
 of all available hardware counters in one command and provides graphical interfaces to help users understand and
-analyze bottlenecks and stressors for their computational workloads on AMD Instinct GPUs.
+analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.

 .. note::

@@ -327,21 +328,380 @@ hardware counters are also included.

   ROCm Systems Profiler timeline trace example.

+.. _mi300x-vllm-optimization:
+
 vLLM performance optimization
 =============================

-vLLM is a high-throughput and memory efficient inference and serving engine for
-large language models that has gained traction in the AI community for its
-performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
-how to:
+vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
+its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
+
+Performance environment variables
+---------------------------------
+
+The following performance tips are not *specific* to vLLM -- they are general
+but relevant in this context. You can tune the following vLLM parameters to
+achieve optimal request latency and throughput performance.
+
+* As described in `Environment variables (MI300X)
+  <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
+  the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
+  performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
+
+* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
+  to ``112`` to increase the number of channels on MI300X to potentially improve
+  performance.
+
+* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance.
+
+Auto-tuning using PyTorch TunableOp
+------------------------------------
+
+Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning. 
+You can run auto-tuning with TunableOp in two simple steps without modifying your code:
+
+* Enable TunableOp and tuning. Optionally, enable verbose mode:
+
+  .. code-block:: shell
+
+     PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
+
+* Enable TunableOp and disable tuning and measure.
+
+  .. code-block:: shell
+
+     PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
+
+Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
+
+Performance tuning based on vLLM engine configurations
+-------------------------------------------------------
+
+The following subsections describe vLLM-specific configurations for performance tuning.
+You can tune the following vLLM parameters to achieve optimal performance.
+
+*  ``tensor_parallel_size``
+
+*  ``gpu_memory_utilization``
+
+*  ``dtype``
+
+*  ``enforce_eager``
+
+*  ``kv_cache_dtype``
+
+*  ``input_len``
+
+*  ``output_len``
+
+*  ``max_num_seqs``
+
+*  ``num_scheduler_steps``
+
+*  ``max_model_len``
+
+*  ``enable_chunked_prefill``
+
+*  ``distributed_executor_backend``
+
+*  ``max_seq_len_to_capture``
+
+Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
+for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
+usage with ROCm.
+
+ROCm provides a prebuilt optimized Docker image for validating the performance
+of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
+ROCm, vLLM, and PyTorch. For more information, see
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
+
+.. _mi300x-vllm-throughput-measurement:
+
+Evaluating performance by throughput measurement
+-------------------------------------------------
+
+This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
+
+Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
+Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals.
+
+* For realistic performance evaluation, you can use datasets like Hugging Face's
+  ``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
+  data, making it a good representation of typical use cases for language models. Download it using
+  the following command:
+
+  .. code-block:: shell
+
+     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+* For standardized benchmarking, you can set fixed input and output token
+  lengths. Synthetic prompts provide consistent benchmarking runs, making it
+  easier to compare performance across different models or configurations.
+  Additionally, a controlled environment simplifies analysis.
+
+By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
+
+.. _mi300x-vllm-single-node:
+
+Maximizing vLLM instances on a single node
+------------------------------------------
+
+The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
+However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
+
+The Instinct MI300X accelerator is equipped with 192GB of HBM3 memory capacity and bandwidth.
+For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
+simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
+variable ``CUDA_VISIBLE_DEVICES``.
+
+For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
+with a model that can fit in one GPU:
+
+.. code-block:: shell
+
+   for i in $(seq 0 7);
+   do
+       CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
+   done
+
+The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
+single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
+using the ``-tp`` N option, where ``1 < N ≤ 8``).
+
+vLLM on MI300X accelerators can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
+Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.
+
+.. _mi300x-vllm-gpu-memory-utilization:
+
+Configure the gpu_memory_utilization parameter
+----------------------------------------------
+
+There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter.
+
+1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
+   it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
+   You can set it to ``>0.9`` and ``<1``.
+
+   For example, below benchmarking command set the ``gpu-memory-utilization`` as 0.98, or 98%.
+
+   .. code-block:: shell
+
+      /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
+
+2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
+
+   Specify GPU memory utilization to run as many instances of vLLM as possible on a single
+   GPU. However, too many instances can result in no memory for KV-cache. For small models, run
+   multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
+   long as it would not cause HIP Out Of Memory. 
+
+   For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
+   ``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
+
+   .. code-block:: shell
+
+      CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 
+      --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
+
+      CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 
+      --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
+
+See :ref:`vllm-engine-args` for other performance suggestions.
+
+.. _mi300x-vllm-multiple-gpus:
+
+Run vLLM on multiple GPUs
+-------------------------
+
+The two main reasons to use multiple GPUs are:
+
+*  The model size is too big to run vLLM using one GPU as it results HIP Out of Memory.
+
+*  To achieve better latency when using a single GPU is not desirable.
+
+To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
+specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
+the GPUs.
+
+For example, you can use two GPUs to start an API server on port 8000:
+
+.. code-block:: shell
+
+   python -m vllm.entrypoints.api_server --model /path/to/model --dtype
+   float16 -tp 2 --port 8000 &
+
+To achieve both latency and throughput performance for serving, you can run multiple API servers on
+different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
+specify the GPUs for each server, for example:
+
+.. code-block:: shell
+
+   CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model
+   /path/to/model --dtype float16 -tp 2 --port 8000 &
+
+   CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model
+   /path/to/model --dtype float16 -tp 2 --port 8001 &
+
+Choose an attention backend
+---------------------------
+
+vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
+requirements:
+
+- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
+  least once as a warm-up step so Triton can perform auto-tuning before
+  collecting benchmarking numbers. This is the default setting.
+
+- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
+  the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
+
+
+Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
+to learn more about Flash Attention with Triton or CK backends.
+
+.. _vllm-engine-args:
+
+vLLM engine arguments
+---------------------
+
+The following are configuration suggestions to potentially improve performance with vLLM. See
+`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
+for a full list of configurable engine arguments.
+
+Configure the max-num-seqs parameter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
+512``). This increases the maximum number of sequences per iteration and can improve throughput.
+
+Use the float16 dtype
+^^^^^^^^^^^^^^^^^^^^^
+
+The default data type (``dtype``) is specified in the model’s configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
+Use float16 (``--dtype float16``) for better performance.
+
+Multi-step scheduling
+^^^^^^^^^^^^^^^^^^^^^
+
+Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 to 15 (``--num-scheduler-steps 10``).
+
+Distributed executor backend
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The vLLM supports two modes of distributed executor backend: ``ray`` and ``mp``. When using the `<https://github.com/ROCm/vllm>`__ fork, using the ``mp``
+backend (``--distributed_executor_backend mp``) is recommended.
+
+Graph mode max-seq-len-to-capture
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has context length
+larger than this, vLLM engine falls back to eager mode. The default is 8192.
+
+When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
+See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
+
+An example of long context length model is Qwen2-7b.
+
+Whether to enable chunked prefill
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Another vLLM performance tip is to enable chunked prefill to improve
+throughput. Chunked prefill allows large prefills to be chunked into
+smaller chunks and batched together with decode requests.
+
+You can enable the feature by specifying ``--enable-chunked-prefill`` in the
+command line or setting ``enable_chunked_prefill=True`` in the LLM
+constructor. 
+
+As stated in `vLLM's documentation, <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
+you can tune the performance by changing ``max_num_batched_tokens``. By
+default, it is set to 512 and optimized for ITL (inter-token latency).
+Smaller ``max_num_batched_tokens`` achieves better ITL because there are
+fewer prefills interrupting decodes.
+Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
+token) as you can put more prefill to the batch.
+
+You might experience noticeable throughput improvements when
+benchmarking on a single GPU or 8 GPUs using the vLLM throughput
+benchmarking script along with the ShareGPT dataset as input.
+
+In the case of fixed ``input-len``/``output-len``, for some configurations,
+enabling chunked prefill increases the throughput. For some other
+configurations, the throughput may be worse and elicit a need to tune
+parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
+
+.. note::
+
+   Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
+
+Quantization support
+---------------------
+
+Quantization reduces the precision of the model’s weights and activations, which significantly decreases the memory footprint.
+``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
+
+FP8 quantization
+^^^^^^^^^^^^^^^^^
+
+`<https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
+Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
+
+AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
+
+* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
+* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
+* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
+* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
+* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
+
+To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
+
+AWQ quantization
+^^^^^^^^^^^^^^^^
+
+You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware that
+that AWQ support in vLLM is currently underoptimized.
+
+To enable vLLM to run on ``awq`` quantized models, using ``--quantization`` parameter with ``awq`` (``--quantization awq``).
+
+You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
+
+fp8 kv-cached-dtype
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
+of ``kv-cache``. As a result, it reduces the cost required for reading and
+writing the ``kv-cache``.
+
+To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
+
+To specify the quantization scaling config, use the
+``--quantization-param-path`` parameter. If the parameter is not specified,
+the default scaling factor of ``1`` is used, which can lead to less accurate
+results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
+Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
+in the vLLM GitHub repository.
+
+Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
+``llama2-7b``.
+
+If building the vLLM using
+`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
+for ``llama2-70b`` scale config, find the file at
+``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
+runtime.
+
+Below is a sample command to run benchmarking with this feature enabled
+for the ``llama2-70b`` model:
+
+.. code-block:: shell
+
+   python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
+   /path/to/llama2-70b-model --kv-cache-dtype "fp8" \
+   --quantization-param-path \
+   "/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
+   --input-len 512 --output-len 256 --num-prompts 500

-* Enable AITER (AI Tensor Engine for ROCm) to speed up on LLM models.
-* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
-* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
-* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
-* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
-* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
-* Benchmark and scale across single-node and multi-node configurations.

 .. _mi300x-tunableop:

@@ -557,7 +917,7 @@ ROCm library tuning involves optimizing the performance of routine computational
 operations (such as ``GEMM``) provided by ROCm libraries like
 :ref:`hipBLASLt <mi300x-hipblaslt>`, :ref:`Composable Kernel <mi300x-ck>`,
 :ref:`MIOpen <mi300x-miopen>`, and :ref:`RCCL <mi300x-rccl>`. This tuning aims
-to maximize efficiency and throughput on Instinct MI300X GPUs to gain 
+to maximize efficiency and throughput on Instinct MI300X accelerators to gain 
 improved application performance.

 .. _mi300x-library-gemm:
@@ -586,33 +946,33 @@ for details.

  .. code-block:: shell

-     HIP_FORCE_DEV_KERNARG=1  hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
+     HIP_FORCE_DEV_KERNARG=1  hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
     --a_type f16_r --b_type f8_r --compute_type f32_f16_r \
-     --initialization trig_float  --cold_iters 100 --iters 1000 --rotating 256
+     --initialization trig_float  --cold_iters 100 --iters 1000 --rotating 256

 * Example 2: Benchmark forward epilogues and backward epilogues

-  *  ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
+  *  ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``

-  *  ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
+  *  ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``

-  *  ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
+  *  ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``

-  *  ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
+  *  ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``

  *  ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";``

-  *  ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
+  *  ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``

-  *  ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
+  *  ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``

-  *  ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
+  *  ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``

-  *  ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
+  *  ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``

-  *  ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
+  *  ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``

-  *  ``HIPBLASLT_EPILOGUE_BGRADB:  "--bias_vector --gradient --bias_source b";``
+  *  ``HIPBLASLT_EPILOGUE_BGRADB:  "--bias_vector --gradient --bias_source b";``


 hipBLASLt auto-tuning using hipblaslt-bench
@@ -671,26 +1031,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates

  .. code-block:: python

-     defaultBenchOptions = {"ProblemType": {
-         "TransposeA": 0,
-         "TransposeB": 0,
-         "ComputeInputDataType": "s",
-         "ComputeDataType": "s",
-         "DataTypeC": "s",
-         "DataTypeD": "s",
-         "UseBias": False
-     }, "TestConfig": {
-         "ColdIter": 20,
-         "Iter": 100,
-         "AlgoMethod": "all",
-         "RequestedSolutions": 2, # Only works in AlgoMethod heuristic
-         "SolutionIndex": None, # Only works in AlgoMethod index
-         "ApiMethod": "cpp",
-         "RotatingBuffer": 0,
-     }, "TuningParameters": {
-         "SplitK": [0]
-     }, "ProblemSizes": []}
-     defaultCreateLogicOptions = {}  # Currently unused
+     defaultBenchOptions = {"ProblemType": {
+         "TransposeA": 0,
+         "TransposeB": 0,
+         "ComputeInputDataType": "s",
+         "ComputeDataType": "s",
+         "DataTypeC": "s",
+         "DataTypeD": "s",
+         "UseBias": False
+     }, "TestConfig": {
+         "ColdIter": 20,
+         "Iter": 100,
+         "AlgoMethod": "all",
+         "RequestedSolutions": 2, # Only works in AlgoMethod heuristic
+         "SolutionIndex": None, # Only works in AlgoMethod index
+         "ApiMethod": "cpp",
+         "RotatingBuffer": 0,
+     }, "TuningParameters": {
+         "SplitK": [0]
+     }, "ProblemSizes": []}
+     defaultCreateLogicOptions = {}  # Currently unused

 * ``TestConfig``
   1. ``ColdIter``: This is number the warm-up iterations before starting the kernel benchmark.
@@ -870,7 +1230,7 @@ command:

 .. code-block:: shell

-   merge.py original_dir new_tuned_yaml_dir output_dir 
+   merge.py original_dir new_tuned_yaml_dir output_dir 

 The following table describes the logic YAML files.

@@ -1091,7 +1451,7 @@ you can only use a fraction of the potential bandwidth on the node.
 The following figure shows an
 :doc:`MI300X node-level architecture </conceptual/gpu-arch/mi300>` of a
 system with AMD EPYC processors in a dual-socket configuration and eight
-AMD Instinct MI300X GPUs. The MI300X OAMs attach to the host system via
+AMD Instinct MI300X accelerators. The MI300X OAMs attach to the host system via
 PCIe Gen 5 x16 links (yellow lines). The GPUs use seven high-bandwidth,
 low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected
 8-GPU system.
@@ -1100,7 +1460,7 @@ low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected

 .. figure:: ../../../data/shared/mi300-node-level-arch.png

-   MI300 Series node-level architecture showing 8 fully interconnected MI300X
+   MI300 series node-level architecture showing 8 fully interconnected MI300X
   OAM modules connected to (optional) PCIe switches via re-timers and HGX
   connectors.

@@ -1293,7 +1653,7 @@ Auto-tunable kernel configuration involves adjusting memory access and computati
 resources assigned to each compute unit. It encompasses the usage of
 :ref:`LDS <mi300x-cu-fig>`, register, and task scheduling on a compute unit.

-The GPU contains global memory, local data share (LDS), and
+The accelerator or GPU contains global memory, local data share (LDS), and
 registers. Global memory has high access latency, but is large. LDS access has
 much lower latency, but is smaller. It is a fast on-CU software-managed memory
 that can be used to efficiently share data between all work items in a block.
@@ -1306,11 +1666,11 @@ Register access is the fastest yet smallest among the three.
   Schematic representation of a CU in the CDNA2 or CDNA3 architecture.

 The following is a list of kernel arguments used for tuning performance and
-resource allocation on AMD GPUs, which helps in optimizing the
+resource allocation on AMD accelerators, which helps in optimizing the
 efficiency and throughput of various computational kernels.

 ``num_stages=n``
-   Adjusts the number of pipeline stages for different types of kernels. On AMD GPUs, set ``num_stages``
+   Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
   according to the following rules:

   * For kernels with a single GEMM, set to ``2``.
@@ -1337,15 +1697,15 @@ efficiency and throughput of various computational kernels.
   * The occupancy of the kernel is limited by VGPR usage, and

   * The current VGPR usage is only a few above a boundary in
-     :ref:`Occupancy related to VGPR usage in an Instinct MI300X GPU <mi300x-occupancy-vgpr-table>`.
+     :ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <mi300x-occupancy-vgpr-table>`.

 .. _mi300x-occupancy-vgpr-table:

 .. figure:: ../../../data/shared/occupancy-vgpr.png
-   :alt: Occupancy related to VGPR usage in an Instinct MI300X GPU.
+   :alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
   :align: center

-   Occupancy related to VGPRs usage on an Instinct MI300X GPU
+   Occupancy related to VGPRs usage on an Instinct MI300X accelerator

 For example, according to the table, each Execution Unit (EU) has 512 available
 VGPRs, which are allocated in blocks of 16. If the current VGPR usage is 170,
@@ -1370,7 +1730,7 @@ VGPR usage so that it might fit 3 waves per EU.

   -  ``matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.

-   For GEMM kernels on an MI300X GPU, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
+   For GEMM kernels on an MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
   tile/GEMM sizes.


@@ -1389,7 +1749,7 @@ the number of CUs a kernel can distribute its task across.

   XCD-level system architecture showing 40 compute units,
   each with 32 KB L1 cache, a unified compute system with 4 ACE compute
-   GPUs, shared 4MB of L2 cache, and a hardware scheduler (HWS).
+   accelerators, shared 4MB of L2 cache, and a hardware scheduler (HWS).

 You can query hardware resources with the command ``rocminfo`` in the
 ``/opt/rocm/bin`` directory. For instance, query the number of CUs, number of
@@ -1473,7 +1833,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.

 From the IR snippet, you can see ``i32`` data is loaded from global memory to
 registers (``%190``). With a few element-wise operations in registers, it is
-stored in shared memory (``%269``) for the transpose operation (``%270``), which
+stored in shared memory (``%269``) for the transpose operation (``%270``), which
 needs data movement across different threads. With the transpose done, it is
 loaded from LDS to register again (``%276``), and with a few more
 element-wise operations, it is stored to LDS again (``%298``). The last step
@@ -1607,7 +1967,7 @@ something similar to the following:
   loaded at: [0x7fd4f100c000-0x7fd4f100e070]

 The kernel name and the code object file should be listed. In the
-example above, the kernel name is vector_add_assert_trap, but this might
+example above, the kernel name is vector_add_assert_trap, but this might
 also look like:

 .. code-block:: text
@@ -1721,8 +2081,3 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
 configuration to two compute streams and two RCCL streams, aligning with this best practice.
 Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
 topology during startup, reducing the need for extensive manual tuning.
-
-Further reading
-===============
-
-* :doc:`vllm-optimization`
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   .. list-table::
      :header-rows: 1
@@ -47,7 +47,7 @@ vLLM inference performance testing

 With this Docker image, you can quickly test the :ref:`expected
 inference performance numbers <vllm-benchmark-performance-measurements-812>` for
-MI300X Series GPUs.
+MI300X series accelerators.

 What's new
 ==========
@@ -139,7 +139,7 @@ page provides reference throughput and serving measurements for inferencing popu
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 System validation
 =================
@@ -424,7 +424,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst
@@ -1,448 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
-   :keywords: model, MAD, automation, dashboarding, validate
-
-**********************************
-vLLM inference performance testing
-**********************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm vLLM
-   inference performance documentation. See :doc:`../vllm` for the latest version.
-
-.. _vllm-benchmark-unified-docker-909:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-
-   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
-   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series accelerators. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   accelerators and includes the following components:
-
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
-      {% endfor %}
-
-With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-909>` for
-MI300X Series accelerators.
-
-What's new
-==========
-
-The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
-
-* Upgraded to vLLM v0.10.1.
-
-* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
-
-* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
-
-.. _vllm-benchmark-supported-models-909:
-
-Supported models
-================
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   .. _vllm-benchmark-available-models-909:
-
-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
-
-   .. _vllm-benchmark-vllm-909:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{ model.mad_tag }}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
-         Some models require access authorization prior to use via an external license agreement through a third party.
-      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
-         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
-      {% endif %}
-
-      {% endfor %}
-   {% endfor %}
-
-.. _vllm-benchmark-performance-measurements-909:
-
-Performance measurements
-========================
-
-To evaluate performance, the
-`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   Pull the Docker image
-   =====================
-
-   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull {{ docker.pull_tag }}
-
-   Benchmarking
-   ============
-
-   Once the setup is complete, choose between two options to reproduce the
-   benchmark results:
-
-   .. _vllm-benchmark-mad-909:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         .. tab-item:: MAD-integrated benchmarking
-
-            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
-
-            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-               directory and install the required packages on the host machine.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD
-                  pip install -r requirements.txt
-
-            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  madengine run \
-                      --tags {{model.mad_tag}} \
-                      --keep-model-dir \
-                      --live-output \
-                      --timeout 28800
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
-            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
-            and ``{{ model.mad_tag }}_serving.csv``.
-
-            Although the :ref:`available models
-            <vllm-benchmark-available-models-909>` are preconfigured to collect
-            offline throughput and online serving performance data, you can
-            also change the benchmarking parameters. See the standalone
-            benchmarking tab for more information.
-
-            {% if model.tunableop %}
-
-            .. note::
-
-               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
-               TunableOp automatically explores different implementations and configurations of certain PyTorch
-               operators to find the fastest one for your hardware.
-
-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
-               the ``--tunableop on`` argument in your run.
-
-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
-               performance-collection run.
-
-            {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
-
-            .. seealso::
-
-               For more information on configuration, see the `config files
-               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
-               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
-               for descriptions of available configuration options
-               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
-               additional benchmarking information.
-
-            .. rubric:: Launch the container
-
-            You can run the vLLM benchmark tool independently by starting the
-            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
-            in the following snippet.
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-               docker run -it \
-                   --device=/dev/kfd \
-                   --device=/dev/dri \
-                   --group-add video \
-                   --shm-size 16G \
-                   --security-opt seccomp=unconfined \
-                   --security-opt apparmor=unconfined \
-                   --cap-add=SYS_PTRACE \
-                   -v $(pwd):/workspace \
-                   --env HUGGINGFACE_HUB_CACHE=/workspace \
-                   --name test \
-                   {{ docker.pull_tag }}
-
-            .. rubric:: Throughput command
-
-            Use the following command to start the throughput benchmark.
-
-            .. code-block:: shell
-
-               model={{ model.model_repo }}
-               tp={{ model.config.tp }}
-               num_prompts=1024
-               in=128
-               out=128
-               dtype={{ model.config.dtype }}
-               kv_cache_dtype={{ model.config.kv_cache_dtype }}
-               max_num_seqs=1024
-               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
-               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-               max_model_len={{ model.config.max_model_len }}
-
-               vllm bench throughput --model $model \
-                   -tp $tp \
-                   --num-prompts $num_prompts \
-                   --input-len $in \
-                   --output-len $out \
-                   --dtype $dtype \
-                   --kv-cache-dtype $kv_cache_dtype \
-                   --max-num-seqs $max_num_seqs \
-                   --max-seq-len-to-capture $max_seq_len_to_capture \
-                   --max-num-batched-tokens $max_num_batched_tokens \
-                   --max-model-len $max_model_len \
-                   --trust-remote-code \
-                   --output-json ${model}_throughput.json \
-                   --gpu-memory-utilization 0.9
-
-            .. rubric:: Serving command
-
-            1. Start the server using the following command:
-
-               .. code-block:: shell
-
-                  model={{ model.model_repo }}
-                  tp={{ model.config.tp }}
-                  dtype={{ model.config.dtype }}
-                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
-                  max_num_seqs=256
-                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
-                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-                  max_model_len={{ model.config.max_model_len }}
-
-                  vllm serve $model \
-                      -tp $tp \
-                      --dtype $dtype \
-                      --kv-cache-dtype $kv_cache_dtype \
-                      --max-num-seqs $max_num_seqs \
-                      --max-seq-len-to-capture $max_seq_len_to_capture \
-                      --max-num-batched-tokens $max_num_batched_tokens \
-                      --max-model-len $max_model_len \
-                      --no-enable-prefix-caching \
-                      --swap-space 16 \
-                      --disable-log-requests \
-                      --trust-remote-code \
-                      --gpu-memory-utilization 0.9
-
-               Wait until the model has loaded and the server is ready to accept requests.
-
-            2. On another terminal on the same machine, run the benchmark:
-
-               .. code-block:: shell
-
-                  # Connect to the container
-                  docker exec -it test bash
-
-                  # Wait for the server to start
-                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
-
-                  # Run the benchmark
-                  model={{ model.model_repo }}
-                  max_concurrency=1
-                  num_prompts=10
-                  in=128
-                  out=128
-                  vllm bench serve --model $model \
-                      --percentile-metrics "ttft,tpot,itl,e2el" \
-                      --dataset-name random \
-                      --ignore-eos \
-                      --max-concurrency $max_concurrency \
-                      --num-prompts $num_prompts \
-                      --random-input-len $in \
-                      --random-output-len $out \
-                      --trust-remote-code \
-                      --save-result \
-                      --result-filename ${model}_serving.json
-
-            .. note::
-
-               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
-               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
-
-               If you encounter the following error, pass your access-authorized Hugging
-               Face token to the gated models.
-
-               .. code-block::
-
-                  OSError: You are trying to access a gated repo.
-
-                  # pass your HF_TOKEN
-                  export HF_TOKEN=$your_personal_hf_token
-
-            .. raw:: html
-
-               <style>
-               mjx-container[jax="CHTML"][display="true"] {
-                  text-align: left;
-                  margin: 0;
-               }
-               </style>
-
-            .. note::
-
-               Throughput is calculated as:
-
-               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
-
-               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
-      {% endfor %}
-   {% endfor %}
-
-Advanced usage
-==============
-
-For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
-
-Reproducing the Docker image
----------------------------
-
-To reproduce this ROCm/vLLM Docker image release, follow these steps:
-
-1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/vllm.git
-
-2. Checkout the specific release commit.
-
-   .. code-block:: shell
-
-      cd vllm
-      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
-
-3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
-
-   .. code-block:: shell
-
-      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
-
-Further reading
-===============
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
-
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
-
- To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
-
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-  a brief introduction to vLLM and optimization strategies.
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
-
- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
-
-Previous versions
-=================
-
-See :doc:`vllm-history` to find documentation for previous releases
-of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.2-20251006.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.2-20251006.rst
@@ -1,482 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
-   :keywords: model, MAD, automation, dashboarding, validate
-
-**********************************
-vLLM inference performance testing
-**********************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm vLLM
-   inference performance documentation. See :doc:`../vllm` for the latest version.
-
-.. _vllm-benchmark-unified-docker-930:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-
-   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
-   prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
-   GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
-   specifically for AMD data center GPUs and includes the following components:
-
-   .. tab-set::
-
-      .. tab-item:: {{ docker.pull_tag }}
-
-         .. list-table::
-            :header-rows: 1
-
-            * - Software component
-              - Version
-
-            {% for component_name, component_version in docker.components.items() %}
-            * - {{ component_name }}
-              - {{ component_version }}
-            {% endfor %}
-
-With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-930>` for
-AMD Instinct GPUs.
-
-What's new
-==========
-
-The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
-
-* Added support for AMD Instinct MI355X and MI350X GPUs.
-
-* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
-
-  * Llama 4 Scout and Maverick
-
-  * DeepSeek R1 0528 FP8
-
-  * MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
-
-  * GPT OSS 20B and 120B
-
-  * Qwen 3 32B, 30B-A3B, and 235B-A22B
-
-* Removed the deprecated ``--max-seq-len-to-capture`` flag.
-
-* ``--gpu-memory-utilization`` is now configurable via the `configuration files
-  <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
-  repository.
-
-.. _vllm-benchmark-supported-models-930:
-
-Supported models
-================
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   .. _vllm-benchmark-available-models-930:
-
-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started. MXFP4 models
-   are only supported on MI355X and MI350X GPUs.
-
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
-
-   .. _vllm-benchmark-vllm-930:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{ model.mad_tag }}
-
-
-      {% if model.precision == "float4" %}
-      .. important::
-
-         MXFP4 is supported only on MI355X and MI350X GPUs.
-      {% endif %}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
-         Some models require access authorization prior to use via an external license agreement through a third party.
-      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
-         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
-      {% endif %}
-      {% if model.precision == "float4" and model.model_repo.startswith("amd") %}
-         This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
-      {% endif %}
-
-      {% endfor %}
-   {% endfor %}
-
-.. _vllm-benchmark-performance-measurements-930:
-
-Performance measurements
-========================
-
-To evaluate performance, the
-`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-Pull the Docker image
-=====================
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-
-   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull {{ docker.pull_tag }}
-
-Benchmarking
-============
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   Once the setup is complete, choose between two options to reproduce the
-   benchmark results:
-
-   .. _vllm-benchmark-mad-930:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         .. tab-item:: MAD-integrated benchmarking
-
-            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
-
-            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-               directory and install the required packages on the host machine.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD
-                  pip install -r requirements.txt
-
-            2. On the host machine, use this command to run the performance benchmark test on
-               the `{{model.model}} <{{ model.url }}>`_ model using one node with the
-               :literal:`{{model.precision}}` data type.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  madengine run \
-                      --tags {{model.mad_tag}} \
-                      --keep-model-dir \
-                      --live-output
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
-            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
-            and ``{{ model.mad_tag }}_serving.csv``.
-
-            Although the :ref:`available models
-            <vllm-benchmark-available-models-930>` are preconfigured to collect
-            offline throughput and online serving performance data, you can
-            also change the benchmarking parameters. See the standalone
-            benchmarking tab for more information.
-
-            {% if model.tunableop %}
-
-            .. note::
-
-               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
-               TunableOp automatically explores different implementations and configurations of certain PyTorch
-               operators to find the fastest one for your hardware.
-
-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
-               the ``--tunableop on`` argument in your run.
-
-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
-               performance-collection run.
-
-            {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
-
-            .. seealso::
-
-               For more information on configuration, see the `config files
-               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
-               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
-               for descriptions of available configuration options
-               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
-               additional benchmarking information.
-
-            .. rubric:: Launch the container
-
-            You can run the vLLM benchmark tool independently by starting the
-            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
-            in the following snippet.
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-               docker run -it \
-                   --device=/dev/kfd \
-                   --device=/dev/dri \
-                   --group-add video \
-                   --shm-size 16G \
-                   --security-opt seccomp=unconfined \
-                   --security-opt apparmor=unconfined \
-                   --cap-add=SYS_PTRACE \
-                   -v $(pwd):/workspace \
-                   --env HUGGINGFACE_HUB_CACHE=/workspace \
-                   --name test \
-                   {{ docker.pull_tag }}
-
-            .. rubric:: Throughput command
-
-            Use the following command to start the throughput benchmark.
-
-            .. code-block:: shell
-
-               model={{ model.model_repo }}
-               tp={{ model.config.tp }}
-               num_prompts={{ model.config.num_prompts | default(1024) }}
-               in={{ model.config.in | default(128) }}
-               out={{ model.config.in | default(128) }}
-               dtype={{ model.config.dtype | default("auto") }}
-               kv_cache_dtype={{ model.config.kv_cache_dtype }}
-               max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
-               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-               max_model_len={{ model.config.max_model_len }}
-
-               vllm bench throughput --model $model \
-                   -tp $tp \
-                   --num-prompts $num_prompts \
-                   --input-len $in \
-                   --output-len $out \
-                   --dtype $dtype \
-                   --kv-cache-dtype $kv_cache_dtype \
-                   --max-num-seqs $max_num_seqs \
-                   --max-num-batched-tokens $max_num_batched_tokens \
-                   --max-model-len $max_model_len \
-                   --trust-remote-code \
-                   --output-json ${model}_throughput.json \
-                   --gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
-
-            .. rubric:: Serving command
-
-            1. Start the server using the following command:
-
-               .. code-block:: shell
-
-                  model={{ model.model_repo }}
-                  tp={{ model.config.tp }}
-                  dtype={{ model.config.dtype }}
-                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
-                  max_num_seqs=256
-                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-                  max_model_len={{ model.config.max_model_len }}
-
-                  vllm serve $model \
-                      -tp $tp \
-                      --dtype $dtype \
-                      --kv-cache-dtype $kv_cache_dtype \
-                      --max-num-seqs $max_num_seqs \
-                      --max-num-batched-tokens $max_num_batched_tokens \
-                      --max-model-len $max_model_len \
-                      --no-enable-prefix-caching \
-                      --swap-space 16 \
-                      --disable-log-requests \
-                      --trust-remote-code \
-                      --gpu-memory-utilization 0.9
-
-               Wait until the model has loaded and the server is ready to accept requests.
-
-            2. On another terminal on the same machine, run the benchmark:
-
-               .. code-block:: shell
-
-                  # Connect to the container
-                  docker exec -it test bash
-
-                  # Wait for the server to start
-                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
-
-                  # Run the benchmark
-                  model={{ model.model_repo }}
-                  max_concurrency=1
-                  num_prompts=10
-                  in=128
-                  out=128
-                  vllm bench serve --model $model \
-                      --percentile-metrics "ttft,tpot,itl,e2el" \
-                      --dataset-name random \
-                      --ignore-eos \
-                      --max-concurrency $max_concurrency \
-                      --num-prompts $num_prompts \
-                      --random-input-len $in \
-                      --random-output-len $out \
-                      --trust-remote-code \
-                      --save-result \
-                      --result-filename ${model}_serving.json
-
-            .. note::
-
-               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
-               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
-
-               If you encounter the following error, pass your access-authorized Hugging
-               Face token to the gated models.
-
-               .. code-block::
-
-                  OSError: You are trying to access a gated repo.
-
-                  # pass your HF_TOKEN
-                  export HF_TOKEN=$your_personal_hf_token
-
-            .. raw:: html
-
-               <style>
-               mjx-container[jax="CHTML"][display="true"] {
-                  text-align: left;
-                  margin: 0;
-               }
-               </style>
-
-            .. note::
-
-               Throughput is calculated as:
-
-               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
-
-               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
-      {% endfor %}
-   {% endfor %}
-
-Advanced usage
-==============
-
-For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
-
-Reproducing the Docker image
----------------------------
-
-To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
-
-1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
-
-   .. code-block:: shell
-
-      git clone https://github.com/vllm-project/vllm.git
-      cd vllm
-
-2. Use the following command to build the image directly from the specified commit.
-
-   .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
-
-      {% set docker = data.dockers[0] %}
-      .. code-block:: shell
-
-         docker build -f docker/Dockerfile.rocm \
-             --build-arg REMOTE_VLLM=1 \
-             --build-arg VLLM_REPO=https://github.com/ROCm/vllm \
-             --build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
-             -t vllm-rocm .
-
-   .. tip::
-
-      Replace ``vllm-rocm`` with your desired image tag.
-
-Further reading
-===============
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
-
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
-
- To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
-
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-  a brief introduction to vLLM and optimization strategies.
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
-
- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
-
-Previous versions
-=================
-
-See :doc:`vllm-history` to find documentation for previous releases
-of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the unified
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ vLLM inference performance testing

 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment designed for validating large language model
-(LLM) inference performance on the AMD Instinct™ MI300X GPU. This
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
 ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
-MI300X GPU and includes the following components:
+MI300X accelerator and includes the following components:

 * `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_

@@ -31,7 +31,7 @@ MI300X GPU and includes the following components:
 * Tuning files (in CSV format)

 With this Docker image, you can quickly validate the expected inference
-performance numbers on the MI300X GPU. This topic also provides tips on
+performance numbers on the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models.

 .. _vllm-benchmark-vllm:
@@ -51,7 +51,7 @@ Getting started
 ===============

 Use the following procedures to reproduce the benchmark results on an
-MI300X GPU with the prebuilt vLLM Docker image.
+MI300X accelerator with the prebuilt vLLM Docker image.

 .. _vllm-benchmark-get-started:

@@ -267,7 +267,7 @@ Options

 .. _vllm-benchmark-run-benchmark-v043:

-Running the benchmark on the MI300X GPU
+Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
@@ -328,7 +328,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the unified
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ vLLM inference performance testing

 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment designed for validating large language model
-(LLM) inference performance on the AMD Instinct™ MI300X GPU. This
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
 ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
-MI300X GPU and includes the following components:
+MI300X accelerator and includes the following components:

 * `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_

@@ -31,7 +31,7 @@ MI300X GPU and includes the following components:
 * Tuning files (in CSV format)

 With this Docker image, you can quickly validate the expected inference
-performance numbers on the MI300X GPU. This topic also provides tips on
+performance numbers on the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models.

 .. hlist::
@@ -74,7 +74,7 @@ Getting started
 ===============

 Use the following procedures to reproduce the benchmark results on an
-MI300X GPU with the prebuilt vLLM Docker image.
+MI300X accelerator with the prebuilt vLLM Docker image.

 .. _vllm-benchmark-get-started:

@@ -332,7 +332,7 @@ Options

 .. _vllm-benchmark-run-benchmark-v064:

-Running the benchmark on the MI300X GPU
+Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
@@ -398,7 +398,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ LLM inference performance validation on AMD Instinct MI300X

 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment for validating large language model (LLM)
-inference performance on the AMD Instinct™ MI300X GPU. This ROCm vLLM
+inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
 Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
-GPU and includes the following components:
+accelerator and includes the following components:

 * `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_

@@ -29,7 +29,7 @@ GPU and includes the following components:
 * `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_

 With this Docker image, you can quickly validate the expected inference
-performance numbers for the MI300X GPU. This topic also provides tips on
+performance numbers for the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models. For more information, see the lists of
 :ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-v066-models>`
 and :ref:`standalone benchmarking <vllm-benchmark-standalone-v066-options>`.
@@ -47,7 +47,7 @@ Getting started
 ===============

 Use the following procedures to reproduce the benchmark results on an
-MI300X GPU with the prebuilt vLLM Docker image.
+MI300X accelerator with the prebuilt vLLM Docker image.

 .. _vllm-benchmark-get-started:

@@ -377,7 +377,7 @@ Options and available models

 .. _vllm-benchmark-run-benchmark-v066:

-Running the benchmark on the MI300X GPU
+Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
@@ -443,7 +443,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/Show More
+++ b/Show More