Fix typo

2026-01-10 15:18:11 -05:00 · 2025-09-03 19:38:36 +00:00
120 changed files with 4148 additions and 15216 deletions
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -79,7 +79,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - task: Bash@3
      displayName: Add lit to PATH
      inputs:
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -131,7 +131,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -212,7 +212,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: ROCR-Runtime
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -64,10 +45,6 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ROCR_Runtime_build_${{ job.os }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -88,18 +65,14 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -109,112 +82,105 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
-      dependsOn: ROCR_Runtime_build_${{ job.os }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
-          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        parameters:
-          runRocminfo: false
-      - task: Bash@3
-        displayName: Build kfdtest
-        inputs:
-          targetType: 'inline'
-          workingDirectory: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest
-          script: |
-            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
-              source /opt/rh/gcc-toolset-14/enable
-            fi
-            mkdir build && cd build
-            cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
-            make
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: kfdtest
-          testExecutable: BIN_DIR=$(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
-          testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
-          testDir: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/scripts
-          os: ${{ job.os }}
-      - task: Bash@3
-        displayName: Build rocrtst
-        inputs:
-          targetType: 'inline'
-          workingDirectory: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common
-          script: |
-            echo $(Agent.BuildDirectory)/s/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
-            sudo cat /etc/ld.so.conf.d/rocm-ci.conf
-            sudo ldconfig -v
-            ldconfig -p
-            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
-              source /opt/rh/gcc-toolset-14/enable
-            fi
-            BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
-            export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
-            mkdir build && cd build
-            cmake .. \
-              -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm \
-              -DTARGET_DEVICES=${{ job.target }} \
-              -DROCM_DIR=$(Agent.BuildDirectory)/rocm \
-              -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm/bin \
-              -DOPENCL_INC_DIR=$BASE_CLANG_DIR/$NEWEST_CLANG_VER/include
-            make
-            make rocrtst_kernels
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: rocrtst
-          testExecutable: ./rocrtst64
-          testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
-          testDir: $(Agent.BuildDirectory)/s//rocrtst/suites/test_common/build/${{ job.target }}
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-    # docker image will be missing libhwloc5
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
+    dependsOn: ROCR_Runtime_build_${{ job.os }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      parameters:
+        runRocminfo: false
+    - task: Bash@3
+      displayName: Build kfdtest
+      inputs:
+        targetType: 'inline'
+        workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
+        script: |
+          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
+            source /opt/rh/gcc-toolset-14/enable
+          fi
+          mkdir build && cd build
+          cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
+          make
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: kfdtest
+        testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
+        testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
+        testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
+        os: ${{ job.os }}
+    - task: Bash@3
+      displayName: Build rocrtst
+      inputs:
+        targetType: 'inline'
+        workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
+        script: |
+          echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
+          sudo cat /etc/ld.so.conf.d/rocm-ci.conf
+          sudo ldconfig -v
+          ldconfig -p
+          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
+            source /opt/rh/gcc-toolset-14/enable
+          fi
+          BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
+          export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
+          mkdir build && cd build
+          cmake .. \
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm \
+            -DTARGET_DEVICES=${{ job.target }} \
+            -DROCM_DIR=$(Agent.BuildDirectory)/rocm \
+            -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm/bin \
+            -DOPENCL_INC_DIR=$BASE_CLANG_DIR/$NEWEST_CLANG_VER/include
+          make
+          make rocrtst_kernels
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocrtst
+        testExecutable: ./rocrtst64
+        testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
+        testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
+  # docker image will be missing libhwloc5
--- a/.azuredevops/components/aqlprofile.yml
+++ b/.azuredevops/components/aqlprofile.yml
@@ -1,174 +0,0 @@
-parameters:
- name: componentName
-  type: string
-  default: aqlprofile
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
-# set to true if doing full build of ROCm stack
-# and dependencies are pulled from same pipeline
- name: aggregatePipeline
-  type: boolean
-  default: false
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-    - python3-pip
- name: rocmDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
- name: rocmTestDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
-    - rocprofiler-register
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ variables.MEDIUM_BUILD_POOL }}
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      parameters:
-        dependencyList:
-          - gtest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        consolidateBuildAndInstall: true
-        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/aqlprofile/cmake_modules
-          -DAQLPROFILE_BUILD_TESTS=ON
-          -DGPU_TARGETS=${{ job.target }}
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          gpuTarget: ${{ job.target }}
-
- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          preTargetFilter: ${{ parameters.componentName }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/rocm/share/hsa-amd-aqlprofile/
-          testExecutable: ./run_tests.sh
-          testParameters: ''
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/hip-tests.yml
+++ b/.azuredevops/components/hip-tests.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: hip-tests
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -79,10 +60,6 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: hip_tests_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -99,18 +76,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    # compile hip-tests
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
+        componentName: hip-tests
        cmakeSourceDir: '../catch'
        customBuildTarget: build_tests
        extraBuildFlags: >-
@@ -122,12 +96,9 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -137,56 +108,52 @@ jobs:
        extraEnvVars:
          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: hip_tests_test_${{ job.target }}
-      timeoutInMinutes: 240
-      dependsOn: hip_tests_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - task: Bash@3
-        displayName: Symlink rocm_agent_enumerator
-        inputs:
-          targetType: inline
-          script: |
-            # Assuming that /opt is no longer persistent across runs, test environments are fully ephemeral
-            sudo mkdir -p /opt/rocm/bin
-            sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/rocm/share/hip
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-          optSymLink: true
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: hip_tests_test_${{ job.target }}
+    timeoutInMinutes: 240
+    dependsOn: hip_tests_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - task: Bash@3
+      displayName: Symlink rocm_agent_enumerator
+      inputs:
+        targetType: inline
+        script: |
+          # Assuming that /opt is no longer persistent across runs, test environments are fully ephemeral
+          sudo mkdir -p /opt/rocm/bin
+          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: hip_tests
+        testDir: $(Agent.BuildDirectory)/rocm/share/hip
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
+        optSymLink: true
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -77,7 +77,6 @@ parameters:
    - clr
    - hipBLAS-common
    - llvm-project
-    - rocm-cmake
    - rocminfo
    - rocm_smi_lib
    - rocprofiler-register
@@ -145,7 +144,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -179,7 +178,7 @@ jobs:
          mkdir -p $(Agent.BuildDirectory)/temp-deps
          cd $(Agent.BuildDirectory)/temp-deps
          # position-independent LAPACK is required for almalinux8 builds
-          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
+          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
          make -j
          sudo make install
    - script: |
@@ -198,8 +197,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
-        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
-        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -40,12 +40,10 @@ parameters:
    - gfortran
    - libgfortran5
    - libopenblas-dev
-    - liblapack-dev
 - name: pipModules
  type: object
  default:
    - joblib
-    - msgpack
 - name: rocmDependencies
  type: object
  default:
@@ -54,7 +52,6 @@ parameters:
    - hipSPARSE
    - llvm-project
    - rocBLAS
-    - rocm-cmake
    - rocm_smi_lib
    - rocminfo
    - rocprofiler-register
@@ -68,7 +65,6 @@ parameters:
    - llvm-project
    - hipBLAS-common
    - hipBLASLt
-    - rocm-cmake
    - rocBLAS
    - rocminfo
    - rocprofiler-register
@@ -112,7 +108,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -128,13 +124,10 @@ jobs:
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-  # NOTE: content between `---` is for transition support between old/new build systems
-  # and should be removed once transition is complete.
-  # -----------------------------
  # Build and install gtest and lapack
  # $(Pipeline.Workspace)/deps is a temporary folder for the build process
  # $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
-    - script: mkdir -p $(Pipeline.Workspace)/deps
+    - script: mkdir $(Pipeline.Workspace)/deps
      displayName: Create temp folder for external dependencies
  # hipSPARSELt already has a CMake script for external deps, so we can just run that
  # https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt
@@ -150,35 +143,22 @@ jobs:
    - script: sudo make install
      displayName: Install hipSPARSELt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
-  # -----------------------------
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
-        # NOTE: the following options are old build only 
-        # and can be removed after full transition to new build
-        # -DAMDGPU_TARGETS=${{ job.target }}
-        # -DCMAKE_Fortran_COMPILER=f95
-        # -DTensile_LOGIC=
-        # -DTensile_CPU_THREADS=
-        # -DTensile_LIBRARY_FORMAT=msgpack
-        # -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-        # -DBUILD_CLIENTS_TESTS=ON
-        # -DBUILD_USE_LOCAL_TENSILE=OFF
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-          -DGPU_TARGETS=${{ job.target }}
-          -DAMDGPU_TARGETS=${{ job.target }}
          -DCMAKE_Fortran_COMPILER=f95
+          -DAMDGPU_TARGETS=${{ job.target }}
          -DTensile_LOGIC=
          -DTensile_CPU_THREADS=
          -DTensile_LIBRARY_FORMAT=msgpack
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_CLIENTS_TESTS=ON
          -DBUILD_USE_LOCAL_TENSILE=OFF
-          -DHIPSPARSELT_ENABLE_FETCH=ON
          -GNinja
        ${{ if ne(parameters.sparseCheckoutDir, '') }}:
          cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -77,7 +77,6 @@ jobs:
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_BUILD_TYPE=Release
          -DHIPTENSOR_BUILD_TESTS=ON
--- a/.azuredevops/components/hipfort.yml
+++ b/.azuredevops/components/hipfort.yml
@@ -71,7 +71,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -1,251 +0,0 @@
-parameters:
- name: componentName
-  type: string
-  default: origami
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
-# set to true if doing full build of ROCm stack
-# and dependencies are pulled from same pipeline
- name: aggregatePipeline
-  type: boolean
-  default: false
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-    - wget
-    - python3
-    - python3-dev
-    - python3-pip
-    - libgtest-dev
-    - libboost-filesystem-dev
-    - libboost-program-options-dev
- name: pipModules
-  type: object
-  default:
-    - nanobind>=2.0.0
- name: rocmDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - rocm-cmake
-    - rocminfo
-    - ROCR-Runtime
-    - rocprofiler-register
- name: rocmTestDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - rocm-cmake
-    - rocminfo
-    - ROCR-Runtime
-    - rocprofiler-register
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt }
-      - { os: almalinux8, packageManager: dnf }
-    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
-  type: object
-  default:
-    - hipBLASLt:
-      name: hipBLASLt
-      sparseCheckoutDir: projects/hipblaslt
-      skipUnifiedBuild: 'false'
-      buildDependsOn:
-        - origami_build
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: origami_build_${{ job.os }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: ROCM_PATH
-      value: $(Agent.BuildDirectory)/rocm
-    pool:
-      vmImage: ${{ variables.BASE_BUILD_POOL }}
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      parameters:
-        dependencyList:
-          - gtest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        os: ${{ job.os }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DORIGAMI_BUILD_SHARED_LIBS=ON
-          -DORIGAMI_ENABLE_PYTHON=ON
-          -DORIGAMI_BUILD_TESTING=ON
-          -GNinja
-    - ${{ if ne(job.os, 'almalinux8') }}:
-      - task: PublishPipelineArtifact@1
-        displayName: 'Publish Build Directory Artifact'
-        inputs:
-          targetPath: '$(Agent.BuildDirectory)/s/build'
-          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
-          publishLocation: 'pipeline'
-      - task: PublishPipelineArtifact@1
-        displayName: 'Publish Python Source Artifact'
-        inputs:
-          targetPath: '$(Agent.BuildDirectory)/s/python'
-          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
-          publishLocation: 'pipeline'
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
-        componentName: ${{ parameters.componentName }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-
- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: origami_test_${{ job.os }}_${{ job.target }}
-      timeoutInMinutes: 120
-      dependsOn: origami_build_${{ job.os }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
-          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          pipModules: ${{ parameters.pipModules }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          preTargetFilter: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-      - task: DownloadPipelineArtifact@2
-        displayName: 'Download Build Directory Artifact'
-        inputs:
-          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
-          path: '$(Agent.BuildDirectory)/s/build'
-      - task: DownloadPipelineArtifact@2
-        displayName: 'Download Python Source Artifact'
-        inputs:
-          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
-          path: '$(Agent.BuildDirectory)/s/python'
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          os: ${{ job.os }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin'
-          testExecutable: './origami-tests'
-          testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
-      - script: |
-          set -e
-          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
-
-          echo "--- Running origami_test.py ---"
-          python3 $(Agent.BuildDirectory)/s/python/origami_test.py
-          
-          echo "--- Running origami_grid_test.py ---"
-          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
-        displayName: 'Run Python Binding Tests'
-        condition: succeeded()
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          pipModules: ${{ parameters.pipModules }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-
- ${{ if parameters.triggerDownstreamJobs }}:
-  - ${{ each component in parameters.downstreamComponentMatrix }}:
-    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
-      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
-        parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
-          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
-          buildDependsOn: ${{ component.buildDependsOn }}
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
-          triggerDownstreamJobs: true
-          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -83,7 +83,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rdc
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -52,7 +33,6 @@ parameters:
    - clr
    - hipBLAS-common
    - hipBLASLt
-    - hipRAND
    - llvm-project
    - rocBLAS
    - rocm-cmake
@@ -63,7 +43,6 @@ parameters:
    - rocprofiler
    - rocprofiler-register
    - rocprofiler-sdk
-    - rocRAND
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
@@ -95,11 +74,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
+  - job: rdc_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -110,22 +85,16 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
  # Build grpc
    - task: Bash@3
      displayName: 'git clone grpc'
@@ -135,7 +104,6 @@ jobs:
        workingDirectory: $(Build.SourcesDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
        cmakeSourceDir: $(Build.SourcesDirectory)/grpc
        installDir: $(Build.SourcesDirectory)/bin
@@ -149,7 +117,6 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DGRPC_ROOT="$(Build.SourcesDirectory)/bin"
@@ -159,12 +126,9 @@ jobs:
          -DAMDGPU_TARGETS=${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -172,64 +136,60 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      - name: ROCM_PATH
-        value: $(Agent.BuildDirectory)/rocm
-      - name: ROCM_DIR
-        value: $(Agent.BuildDirectory)/rocm
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-              downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - task: Bash@3
-        displayName: Setup test environment
-        inputs:
-          targetType: inline
-          script: |
-            sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
-            echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
-            sudo ldconfig -v
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - task: Bash@3
-        displayName: Test rdc
-        inputs:
-          targetType: inline
-          script: >-
-            $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
-            --batch_mode
-            --start_rdcd
-            --unauth_comm
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-          extraPaths: /home/user/workspace/rocm/bin
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rdc_test_${{ job.target }}
+    dependsOn: rdc_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    - name: ROCM_DIR
+      value: $(Agent.BuildDirectory)/rocm
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - task: Bash@3
+      displayName: Setup test environment
+      inputs:
+        targetType: inline
+        script: |
+          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
+          echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
+          sudo ldconfig -v
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - task: Bash@3
+      displayName: Test rdc
+      inputs:
+        targetType: inline
+        script: >-
+          $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
+          --batch_mode
+          --start_rdcd
+          --unauth_comm
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
+        extraPaths: /home/user/workspace/rocm/bin
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -70,7 +70,6 @@ parameters:
    - hipBLAS-common
    - hipBLASLt
    - llvm-project
-    - rocm-cmake
    - rocminfo
    - rocprofiler-register
    - rocm_smi_lib
@@ -155,7 +154,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -180,8 +179,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
-        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/rocblas
-        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/rocblas/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_BUILD_TYPE=Release
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,25 +8,6 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
- name: rocPyDecodeRepo
-  type: string
-  default: rocpydecode_repo
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -75,23 +56,10 @@ parameters:
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
-  type: object
-  default:
-    - rocPyDecode:
-      name: rocPyDecode
-      sparseCheckoutDir: ''
-      skipUnifiedBuild: 'false'
-      buildDependsOn:
-        - rocDecode_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -115,15 +83,12 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -204,15 +169,3 @@ jobs:
        registerROCmPackages: true
        environment: test
        gpuTarget: ${{ job.target }}
-
- ${{ if parameters.triggerDownstreamJobs }}:
-  - ${{ each component in parameters.downstreamComponentMatrix }}:
-    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
-      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
-        parameters:
-          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
-          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
-          buildDependsOn: ${{ component.buildDependsOn }}
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
-          triggerDownstreamJobs: true
-          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -5,22 +5,6 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -63,19 +47,19 @@ parameters:
  type: object
  default:
    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+      - gfx942:
+        target: gfx942
+      - gfx90a:
+        target: gfx90a
    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+      - gfx942:
+        target: gfx942
+      - gfx90a:
+        target: gfx90a

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocPyDecode_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -90,20 +74,16 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: 'Save Python Package Paths'
      inputs:
--- a/.azuredevops/components/rocm-core.yml
+++ b/.azuredevops/components/rocm-core.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rocm-core
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -46,10 +27,6 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_core_${{ job.os }}
-    ${{ if parameters.buildDependsOn }}:
-        dependsOn:
-          - ${{ each build in parameters.buildDependsOn }}:
-            - ${{ build }}_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -73,10 +50,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -90,12 +65,9 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -33,20 +33,16 @@ parameters:
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - hipTensor
    - llvm-project
    - rocBLAS
    - rocFFT
-    - rocJPEG
    - rocPRIM
    - rocprofiler-register
-    - rocprofiler-sdk
    - ROCR-Runtime
    - rocRAND
    - rocSOLVER
    - rocSPARSE
    - rocThrust
-    - rocWMMA
 - name: rocmTestDependencies
  type: object
  default:
@@ -61,22 +57,18 @@ parameters:
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - hipTensor
    - llvm-project
    - rocBLAS
    - rocFFT
    - rocminfo
    - rocPRIM
-    - rocJPEG
    - rocprofiler-register
-    - rocprofiler-sdk
    - ROCR-Runtime
    - rocRAND
    - rocSOLVER
    - rocSPARSE
    - rocThrust
    - roctracer
-    - rocWMMA

 - name: jobMatrix
  type: object
@@ -105,9 +97,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -169,9 +158,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/rocm-libraries.yml
+++ b/.azuredevops/components/rocm-libraries.yml
@@ -102,7 +102,7 @@ jobs:
    workspace:
      clean: all
    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rocm_smi_lib
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -51,10 +32,6 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_smi_lib_build_${{ job.os }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -78,10 +55,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -90,56 +65,51 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
-      dependsOn: rocm_smi_lib_build_${{ job.os }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        parameters:
-          runRocminfo: false
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: '$(Agent.BuildDirectory)'
-          testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
-          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
+    dependsOn: rocm_smi_lib_build_${{ job.os }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      parameters:
+        runRocminfo: false
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocm_smi_lib
+        testDir: '$(Agent.BuildDirectory)'
+        testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
+        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rocminfo
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -59,11 +40,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
+  - job: rocminfo_build_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -85,18 +62,14 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -105,71 +78,65 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: rocminfo_test_${{ job.target }}
-      dependsOn: rocminfo_build_${{ job.os }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-          registerROCmPackages: true
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        parameters:
-          runRocminfo: false
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: '$(Agent.BuildDirectory)'
-          testExecutable: './rocm/bin/rocminfo'
-          testParameters: ''
-          testPublishResults: false
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: rocm_agent_enumerator
-          testDir: '$(Agent.BuildDirectory)'
-          testExecutable: './rocm/bin/rocm_agent_enumerator'
-          testParameters: ''
-          testPublishResults: false
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          registerROCmPackages: true
-          environment: test
-          gpuTarget: ${{ job.target }}
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rocminfo_test_${{ job.target }}
+    dependsOn: rocminfo_build_${{ job.os }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+        registerROCmPackages: true
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      parameters:
+        runRocminfo: false
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocminfo
+        testDir: '$(Agent.BuildDirectory)'
+        testExecutable: './rocm/bin/rocminfo'
+        testParameters: ''
+        testPublishResults: false
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocm_agent_enumerator
+        testDir: '$(Agent.BuildDirectory)'
+        testExecutable: './rocm/bin/rocm_agent_enumerator'
+        testParameters: ''
+        testPublishResults: false
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        registerROCmPackages: true
+        environment: test
+        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -55,7 +55,6 @@ parameters:
    - pymongo
    - pyyaml
    - setuptools
-    - sqlalchemy
    - tabulate
    - textual
    - textual_plotext
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rocprofiler-sdk
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -92,10 +73,6 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_sdk_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -112,7 +89,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -120,8 +96,6 @@ jobs:
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Add Python site-packages binaries to path
      inputs:
@@ -131,7 +105,6 @@ jobs:
          echo "##vso[task.prependpath]$USER_BASE/bin"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCPROFILER_BUILD_TESTS=ON
@@ -141,12 +114,9 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -156,68 +126,62 @@ jobs:
    #     gpuTarget: ${{ job.target }}
    #     registerROCmPackages: true

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: rocprofiler_sdk_test_${{ job.target }}
-      dependsOn: rocprofiler_sdk_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          pipModules: ${{ parameters.pipModules }}
-          registerROCmPackages: true
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        parameters:
-          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-          checkoutRepo: ${{ parameters.checkoutRepo }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-              downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - task: Bash@3
-        displayName: Add Python and ROCm binaries to path
-        inputs:
-          targetType: inline
-          script: |
-            USER_BASE=$(python3 -m site --user-base)
-            echo "##vso[task.prependpath]$USER_BASE/bin"
-            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          extraBuildFlags: >-
-            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-            -DROCPROFILER_BUILD_TESTS=ON
-            -DROCPROFILER_BUILD_SAMPLES=ON
-            -DROCPROFILER_BUILD_RELEASE=ON
-            -DGPU_TARGETS=${{ job.target }}
-            -GNinja
-      - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/s/build
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          pipModules: ${{ parameters.pipModules }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-          registerROCmPackages: true
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rocprofiler_sdk_test_${{ job.target }}
+    dependsOn: rocprofiler_sdk_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        registerROCmPackages: true
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        gpuTarget: ${{ job.target }}
+    - task: Bash@3
+      displayName: Add Python and ROCm binaries to path
+      inputs:
+        targetType: inline
+        script: |
+          USER_BASE=$(python3 -m site --user-base)
+          echo "##vso[task.prependpath]$USER_BASE/bin"
+          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DROCPROFILER_BUILD_TESTS=ON
+          -DROCPROFILER_BUILD_SAMPLES=ON
+          -DROCPROFILER_BUILD_RELEASE=ON
+          -DGPU_TARGETS=${{ job.target }}
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocprofiler-sdk
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        environment: test
+        gpuTarget: ${{ job.target }}
+        registerROCmPackages: true
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -6,25 +6,6 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: componentName
-  type: string
-  default: rocprofiler-systems
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -106,10 +87,6 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_systems_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-        dependsOn:
-          - ${{ each build in parameters.buildDependsOn }}:
-            - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -128,7 +105,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -160,16 +136,12 @@ jobs:
          -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
          -DGPU_TARGETS=${{ job.target }}
          -GNinja
-        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        gpuTarget: ${{ job.target }}
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        gpuTarget: ${{ job.target }}
-        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
@@ -179,93 +151,85 @@ jobs:
        registerROCmPackages: true
        extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: rocprofiler_systems_test_${{ job.target }}
-      dependsOn: rocprofiler_systems_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      timeoutInMinutes: 180
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      - name: ROCM_PATH
-        value: $(Agent.BuildDirectory)/rocm
-      pool:
-        name: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          pipModules: ${{ parameters.pipModules }}
-          registerROCmPackages: true
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - task: Bash@3
-        displayName: Add ROCm binaries to PATH
-        inputs:
-          targetType: inline
-          script: |
-            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        parameters:
-          cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
-    # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
-          extraBuildFlags: >-
-            -DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocprofiler-systems
-            -DROCPROFSYS_USE_PYTHON=ON
-            -DROCPROFSYS_BUILD_TESTING=ON
-            -DROCPROFSYS_BUILD_DYNINST=ON
-            -DROCPROFSYS_BUILD_LIBUNWIND=ON
-            -DROCPROFSYS_DISABLE_EXAMPLES="openmp-target"
-            -DDYNINST_BUILD_TBB=ON
-            -DDYNINST_BUILD_ELFUTILS=ON
-            -DDYNINST_BUILD_LIBIBERTY=ON
-            -DDYNINST_BUILD_BOOST=ON
-            -DROCPROFSYS_USE_PAPI=ON
-            -DROCPROFSYS_USE_MPI=ON
-            -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
-            -DGPU_TARGETS=${{ job.target }}
-            -GNinja
-      - task: Bash@3
-        displayName: Set up rocprofiler-systems env
-        inputs:
-          targetType: inline
-          script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
-          workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/s/build/tests/
-          testParameters: '--output-on-failure'
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          pipModules: ${{ parameters.pipModules }}
-          environment: test
-          registerROCmPackages: true
-          gpuTarget: ${{ job.target }}
-          extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rocprofiler_systems_test_${{ job.target }}
+    dependsOn: rocprofiler_systems_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    timeoutInMinutes: 180
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    pool:
+      name: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        registerROCmPackages: true
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        gpuTarget: ${{ job.target }}
+    - task: Bash@3
+      displayName: Add ROCm binaries to PATH
+      inputs:
+        targetType: inline
+        script: |
+          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+  # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
+        extraBuildFlags: >-
+          -DROCPROFSYS_BUILD_TESTING=ON
+          -DROCPROFSYS_BUILD_DYNINST=ON
+          -DROCPROFSYS_BUILD_LIBUNWIND=ON
+          -DROCPROFSYS_DISABLE_EXAMPLES="openmp-target"
+          -DDYNINST_BUILD_TBB=ON
+          -DDYNINST_BUILD_ELFUTILS=ON
+          -DDYNINST_BUILD_LIBIBERTY=ON
+          -DDYNINST_BUILD_BOOST=ON
+          -DROCPROFSYS_USE_PAPI=ON
+          -DROCPROFSYS_USE_MPI=ON
+          -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
+          -DGPU_TARGETS=${{ job.target }}
+          -GNinja
+    - task: Bash@3
+      displayName: Set up rocprofiler-systems env
+      inputs:
+        targetType: inline
+        script: source share/rocprofiler-systems/setup-env.sh
+        workingDirectory: build
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
+        componentName: rocprofiler-systems
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        environment: test
+        registerROCmPackages: true
+        gpuTarget: ${{ job.target }}
+        extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin
--- a/.azuredevops/dependencies/catch2.yml
+++ b/.azuredevops/dependencies/catch2.yml
@@ -1,63 +0,0 @@
-parameters:
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
- name: catch2Version
-  type: string
-  default: ''
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt}
-      - { os: almalinux8, packageManager: dnf}
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: catch2_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - checkout: none
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - task: Bash@3
-      displayName: Clone catch2 ${{ parameters.catch2Version }}
-      inputs:
-        targetType: inline
-        script: git clone https://github.com/catchorg/Catch2.git -b ${{ parameters.catch2Version }}
-        workingDirectory: $(Agent.BuildDirectory)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        cmakeBuildDir: $(Agent.BuildDirectory)/Catch2/build
-        cmakeSourceDir: $(Agent.BuildDirectory)/Catch2
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DCMAKE_BUILD_TYPE=Release
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
--- a/.azuredevops/dependencies/libdivide.yml
+++ b/.azuredevops/dependencies/libdivide.yml
@@ -1,64 +0,0 @@
-parameters:
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
- name: libdivideVersion
-  type: string
-  default: ''
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt}
-      - { os: almalinux8, packageManager: dnf}
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: libdivide_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - checkout: none
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - task: Bash@3
-      displayName: Clone libdivide ${{ parameters.libdivideVersion }}
-      inputs:
-        targetType: inline
-        script: git clone https://github.com/ridiculousfish/libdivide.git -b ${{ parameters.libdivideVersion }}
-        workingDirectory: $(Agent.BuildDirectory)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        cmakeBuildDir: $(Agent.BuildDirectory)/libdivide/build
-        cmakeSourceDir: $(Agent.BuildDirectory)/libdivide
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DCMAKE_BUILD_TYPE=Release
-          -DLIBDIVIDE_BUILD_TESTS=OFF
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
--- a/.azuredevops/tag-builds/catch2.yml
+++ b/.azuredevops/tag-builds/catch2.yml
@@ -1,23 +0,0 @@
-variables:
- group: common
- template: /.azuredevops/variables-global.yml
-
-parameters:
- name: catch2Version
-  type: string
-  default: "v3.7.0"
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-
-trigger: none
-pr: none
-
-jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
-    parameters:
-      catch2Version: ${{ parameters.catch2Version }}
--- a/.azuredevops/tag-builds/libdivide.yml
+++ b/.azuredevops/tag-builds/libdivide.yml
@@ -1,23 +0,0 @@
-variables:
- group: common
- template: /.azuredevops/variables-global.yml
-
-parameters:
- name: libdivideVersion
-  type: string
-  default: master
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-
-trigger: none
-pr: none
-
-jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/libdivide.yml
-    parameters:
-      libdivideVersion: ${{ parameters.libdivideVersion }}
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -20,7 +20,7 @@ steps:
    retryCountOnTaskFailure: 3
    fetchFilter: blob:none
    ${{ if ne(parameters.sparseCheckoutDir, '') }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }} shared
+      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
      path: sparse
  - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
    - task: Bash@3
--- a/.azuredevops/templates/steps/dependencies-cmake-latest.yml
+++ b/.azuredevops/templates/steps/dependencies-cmake-latest.yml
@@ -1,15 +1,10 @@
-parameters:
-  - name: cmakeVersion
-    type: string
-    default: '3.31.0'
-
 steps:
 - task: Bash@3
-  displayName: Install CMake ${{ parameters.cmakeVersion }}
+  displayName: Install CMake 3.31
  inputs:
    targetType: inline
    script: |
-      CMAKE_VERSION=${{ parameters.cmakeVersion }}
+      CMAKE_VERSION=3.31.0
      CMAKE_ROOT="$(Pipeline.Workspace)/cmake"

      echo "Downloading CMake $CMAKE_VERSION..."
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -46,10 +46,6 @@ parameters:
      pipelineId: 115
      developBranch: aomp-dev
      hasGpuTarget: false
-    aqlprofile:
-      pipelineId: 365
-      developBranch: develop
-      hasGpuTarget: false
    clr:
      pipelineId: 335
      developBranch: develop
@@ -67,8 +63,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    hip-tests:
-      pipelineId: 362
-      developBranch: develop
+      pipelineId: 233
+      developBranch: amd-staging
      hasGpuTarget: false
    hipBLAS:
      pipelineId: 317
@@ -130,17 +126,13 @@ parameters:
      pipelineId: 80
      developBranch: develop
      hasGpuTarget: true
-    origami:
-      pipelineId: 364
-      developBranch: develop
-      hasGpuTarget: true
    rccl:
      pipelineId: 107
      developBranch: develop
      hasGpuTarget: true
    rdc:
-      pipelineId: 360
-      developBranch: develop
+      pipelineId: 100
+      developBranch: amd-staging
      hasGpuTarget: false
    rocAL:
      pipelineId: 151
@@ -179,16 +171,16 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocm-core:
-      pipelineId: 349
-      developBranch: develop
+      pipelineId: 103
+      developBranch: master
      hasGpuTarget: false
    rocm-examples:
      pipelineId: 216
      developBranch: amd-staging
      hasGpuTarget: true
    rocminfo:
-      pipelineId: 356
-      developBranch: develop
+      pipelineId: 91
+      developBranch: amd-staging
      hasGpuTarget: false
    rocMLIR:
      pipelineId: 229
@@ -203,8 +195,8 @@ parameters:
      developBranch: master
      hasGpuTarget: false
    rocm_smi_lib:
-      pipelineId: 358
-      developBranch: develop
+      pipelineId: 96
+      developBranch: amd-staging
      hasGpuTarget: false
    rocPRIM:
      pipelineId: 273
@@ -215,7 +207,7 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-compute:
-      pipelineId: 344
+      pipelineId: 257
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-register:
@@ -223,20 +215,20 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocprofiler-sdk:
-      pipelineId: 347
-      developBranch: develop
+      pipelineId: 246
+      developBranch: amd-staging
      hasGpuTarget: true
    rocprofiler-systems:
-      pipelineId: 345
-      developBranch: develop
+      pipelineId: 255
+      developBranch: amd-staging
      hasGpuTarget: true
    rocPyDecode:
      pipelineId: 239
      developBranch: develop
      hasGpuTarget: true
    ROCR-Runtime:
-      pipelineId: 354
-      developBranch: develop
+      pipelineId: 10
+      developBranch: amd-staging
      hasGpuTarget: false
    rocRAND:
      pipelineId: 274
@@ -259,8 +251,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    roctracer:
-      pipelineId: 331
-      developBranch: develop
+      pipelineId: 141
+      developBranch: amd-staging
      hasGpuTarget: true
    rocWMMA:
      pipelineId: 109
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -8,13 +8,11 @@ parameters:
  type: object
  default:
    boost: 250
-    catch2: 343
    fmtlib: 341
    grpc: 72
    gtest: 73
    half560: 68
    lapack: 69
-    libdivide: 342
    spdlog: 340

 steps:
@@ -33,7 +31,7 @@ steps:
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: $(Agent.BuildDirectory)/vendor
-      cleanDestinationFolder: false
+      cleanDestinationFolder: true
      overwriteExistingFiles: true
  - task: DeleteFiles@1
    displayName: Clean up ${{ dependency }}
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -43,7 +43,6 @@ Blit
 Blockwise
 Bluefield
 Bootloader
-Broadcom
 CAS
 CCD
 CDNA
@@ -63,7 +62,6 @@ CPU
 CPUs
 Cron
 CSC
-CSDATA
 CSE
 CSV
 CSn
@@ -73,7 +71,6 @@ CU
 CUDA
 CUs
 CXX
-CX
 Cavium
 CentOS
 ChatGPT
@@ -84,7 +81,6 @@ CommonMark
 Concretized
 Conda
 ConnectX
-CountOnes
 CuPy
 da
 Dashboarding
@@ -101,7 +97,6 @@ DIMM
 DKMS
 DL
 DMA
-DOMContentLoaded
 DNN
 DNNL
 DPM
@@ -120,8 +115,6 @@ Dependabot
 Deprecations
 DevCap
 DirectX
-Disaggregated
-disaggregated
 Dockerfile
 Dockerized
 Doxygen
@@ -131,10 +124,8 @@ ENDPGM
 EPYC
 ESXi
 EoS
-etcd
 fas
 FBGEMM
-FIFOs
 FFT
 FFTs
 FFmpeg
@@ -147,8 +138,6 @@ Filesystem
 FindDb
 Flang
 FlashAttention
-FlashInfer’s
-FlashInfer
 FluxBenchmark
 Fortran
 Fuyu
@@ -167,7 +156,6 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
-GGUF
 Gemma
 GiB
 GIM
@@ -185,7 +173,6 @@ GPUs
 Graphbolt
 GraphSage
 GRBM
-GRE
 GenAI
 GenZ
 GitHub
@@ -213,7 +200,6 @@ Higgs
 href
 Hyperparameters
 Huggingface
-IB
 ICD
 ICT
 ICV
@@ -222,11 +208,8 @@ IDEs
 IFWI
 IMDb
 IncDec
-instrSize
-interpolators
 IOMMU
 IOP
-IOPS
 IOPM
 IOV
 IRQ
@@ -263,15 +246,12 @@ LLM
 LLMs
 LLVM
 LM
-LRU
 LSAN
 LSan
 LTS
 LSTMs
-LteAll
 LanguageCrossEntropy
 LoRA
-MECO
 MEM
 MERCHANTABILITY
 MFMA
@@ -290,7 +270,6 @@ MNIST
 MPI
 MPT
 MSVC
-mul
 MVAPICH
 MVFFR
 Makefile
@@ -309,14 +288,11 @@ MirroredStrategy
 Mixtral
 MosaicML
 MoEs
-Mooncake
 Mpops
 Multicore
 Multithreaded
-MXFP
 MyEnvironment
 MyST
-NANOO
 NBIO
 NBIOs
 NCCL
@@ -371,7 +347,6 @@ PCC
 PCI
 PCIe
 PEFT
-perf
 PEQT
 PIL
 PILImage
@@ -455,9 +430,7 @@ SKU
 SKUs
 SLES
 SLURM
-Slurm
 SMEM
-SMFMA
 SMI
 SMT
 SPI
@@ -469,24 +442,18 @@ SWE
 SerDes
 ShareGPT
 Shlens
-simd
 Skylake
 Softmax
 Spack
 SplitK
 Supermicro
 Szegedy
-TagRAM
 TCA
 TCC
-TCCs
 TCI
 TCIU
 TCP
 TCR
-TVM
-THREADGROUPS
-threadgroups
 TensorRT
 TensorFloat
 TF
@@ -530,11 +497,9 @@ UltraChat
 Uncached
 Unittests
 Unhandled
-unwindowed
 VALU
 VBIOS
 VCN
-verl's
 VGPR
 VGPRs
 VM
@@ -547,13 +512,11 @@ Vanhoucke
 Vulkan
 WGP
 WGPs
-WR
 WX
 WikiText
 Wojna
 Workgroups
 Writebacks
-xcc
 XCD
 XCDs
 XGBoost
@@ -574,7 +537,6 @@ ZenDNN
 accuracies
 activations
 addr
-addEventListener
 ade
 ai
 alloc
@@ -590,7 +552,6 @@ autogenerated
 autotune
 avx
 awk
-az
 backend
 backends
 bb
@@ -608,7 +569,6 @@ boson
 bosons
 br
 BrainFloat
-btn
 buildable
 bursty
 bzip
@@ -620,21 +580,18 @@ centric
 changelog
 checkpointing
 chiplet
-classList
 cmake
 cmd
 coalescable
 codename
 collater
 comgr
-compat
 completers
 composable
 concretization
 config
 configs
 conformant
-const
 constructible
 convolutional
 convolves
@@ -675,7 +632,6 @@ detections
 dev
 devicelibs
 devsel
-dgl
 dimensionality
 disambiguates
 distro
@@ -699,7 +655,6 @@ exascale
 executables
 ffmpeg
 filesystem
-forEach
 fortran
 fp
 framebuffer
@@ -708,16 +663,13 @@ galb
 gcc
 gdb
 gemm
-getAttribute
 gfortran
 gfx
 githooks
 github
 globals
 gnupg
-gpu
 grayscale
-gx
 gzip
 heterogenous
 hipBLAS
@@ -770,7 +722,6 @@ invariants
 invocating
 ipo
 jax
-json
 kdb
 kfd
 kv
@@ -791,8 +742,6 @@ logits
 lossy
 macOS
 matchers
-maxtext
-megablocks
 megatron
 microarchitecture
 migraphx
@@ -821,7 +770,6 @@ opencv
 openmp
 openssl
 optimizers
-ol
 os
 oversubscription
 pageable
@@ -831,7 +779,6 @@ parallelizing
 param
 parameterization
 passthrough
-pe
 perfcounter
 performant
 perl
@@ -861,7 +808,6 @@ profiler
 profilers
 protobuf
 pseudorandom
-px
 py
 pytorch
 recommender
@@ -869,8 +815,6 @@ recommenders
 quantile
 quantizer
 quasirandom
-querySelector
-querySelectorAll
 queueing
 qwen
 radeon
@@ -889,8 +833,6 @@ req
 resampling
 rescaling
 reusability
-rhel
-rl
 RLHF
 roadmap
 roc
@@ -935,23 +877,19 @@ scalability
 scalable
 scipy
 seealso
-selectedTag
 sendmsg
 seqs
 serializers
-setAttribute
 sglang
 shader
 sharding
 sigmoid
-sles
 sm
 smi
 softmax
 spack
 spmm
 src
-stanford
 stochastically
 strided
 subcommand
@@ -968,10 +906,8 @@ symlink
 symlinks
 sys
 tabindex
-targetContainer
 td
 tensorfloat
-tf
 th
 tokenization
 tokenize
@@ -982,9 +918,7 @@ toolchain
 toolchains
 toolset
 toolsets
-torchtitan
 torchvision
-tp
 tqdm
 tracebacks
 txt
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -10,15 +10,13 @@
 <!-- markdownlint-disable reference-links-images            -->
 <!-- markdownlint-disable no-missing-space-atx              -->
 <!-- spellcheck-disable                                     -->
-# ROCm 7.0.2 release notes
+# ROCm 6.4.3 release notes

 The release notes provide a summary of notable changes since the previous ROCm release.

 - [Release highlights](#release-highlights)

- [Supported hardware, operating system, and virtualization changes](#supported-hardware-operating-system-and-virtualization-changes)
-
- [User space, driver, and firmware dependent changes](#user-space-driver-and-firmware-dependent-changes)
+- [Operating system and hardware support changes](#operating-system-and-hardware-support-changes)

 - [ROCm components versioning](#rocm-components)

@@ -29,223 +27,54 @@ The release notes provide a summary of notable changes since the previous ROCm r
 - [ROCm upcoming changes](#rocm-upcoming-changes)

 ```{note}
-If you’re using AMD Radeon GPUs or Ryzen APUs in a workstation setting with a display connected, see the [Use ROCm on Radeon and Ryzen](https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/index.html)
+If you’re using AMD Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, see the [Use ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility/native_linux/native_linux_compatibility.html)
 documentation to verify compatibility and system requirements.
 ```

 ## Release highlights

-The following are notable new features and improvements in ROCm 7.0.2. For changes to individual components, see
-[Detailed component changes](#detailed-component-changes).
+ROCm 6.4.3 is a quality release that resolves the following issues. For changes to individual components, see [Detailed component changes](#detailed-component-changes).

-### Supported hardware, operating system, and virtualization changes
+### AMDGPU driver updates

-ROCm 7.0.2 adds support for the RDNA4 architecture-based [AMD Radeon RX 9060](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9060.html). For more information about supported AMD hardware, see [Supported GPUs (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/reference/system-requirements.html#supported-gpus).
+* Resolved a performance degradation in communication operations that was caused by increased latency in certain RCCL applications. The fix prevents unnecessary queue eviction during the fork process.
+* Fixed an issue in the AMDGPU driver’s scheduler constraints that could cause queue preemption to fail during workload execution.

-ROCm 7.0.2 adds support for the following operating systems and kernel versions:
-
-* Debian 13 (kernel: 6.12)
-* Oracle Linux 10 (kernel: 6.12.0 [UEK])
-* RHEL 10.0 (kernel: 6.12.0-55)
-
-For more information about supported operating systems, see [Supported operating systems](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/reference/system-requirements.html#supported-operating-systems) and [install instructions](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/).
-
-#### Virtualization support
-
-Virtualization support remains unchanged in this release. For more information, see  [Virtualization Support](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/reference/system-requirements.html#virtualization-support).
-
-### User space, driver, and firmware dependent changes
-
-The software for AMD Datacenter GPU products requires maintaining a hardware
-and software stack with interdependencies between the GPU and baseboard
-firmware, AMD GPU drivers, and the ROCm user space software.
-
-<div class="pst-scrollable-table-container">
-  <table class="table" align="left" valign="middle">
-    <thead>
-      <tr>
-          <th class="head">
-            <p>ROCm Version</p>
-          </th>
-          <th class="head">
-            <p>GPU</p>
-          </th>
-          <th class="head">
-            <p>PLDM Bundle (Firmware)</p>
-          </th>
-          <th class="head">
-            <p>AMD GPU Driver (amdgpu)</p>
-          </th>
-          <th class="head">
-            <p>AMD GPU <br>
-              Virtualization Driver (GIM)</p>
-          </th>
-      </tr>
-    </thead>
-    <style>
-        tbody#virtualization-support-instinct tr:last-child {
-          border-bottom: 2px solid var(--pst-color-primary);
-        }
-    </style>
-      <tr>
-          <td rowspan="9" style="vertical-align: middle;">ROCm 7.0.2</td>
-          <td>MI355X</td>
-          <td>
-              01.25.15.02 (or later)<br>
-              01.25.13.09
-          </td>
-          <td>30.10.2<br>
-              30.10.1<br>
-              30.10</td>
-          <td rowspan="3" style="vertical-align: middle;">8.4.1.K</td>
-      </tr>
-      <tr>
-          <td>MI350X</td>
-          <td>
-              01.25.15.02 (or later)<br>
-              01.25.13.09
-          </td>
-          <td>30.10.2<br>
-              30.10.1<br>
-              30.10</td>
-      </tr>
-      <tr>
-          <td>MI325X</td>
-          <td>
-              01.25.04.02 (or later)<br>
-              01.25.03.03
-          </td>
-          <td>
-              30.10.2<br>
-              30.10.1<br>
-              30.10<br>
-              6.4.z where z (0-3)<br>
-              6.3.y where y (1-3)
-          </td>
-      </tr>
-      <tr>
-          <td>MI300X</td>
-          <td>01.25.05.00 (or later)<a href="#footnote1"><sup>[1]</sup></a><br>
-              01.25.03.12</td>
-          <td rowspan="6" style="vertical-align: middle;">
-              30.10.2<br>
-              30.10.1<br>
-              30.10<br>
-              6.4.z where z (0–3)<br>
-              6.3.y where y (0–3)<br>
-              6.2.x where x (1–4)
-          </td>
-          <td>8.4.1.K</td>
-      </tr>
-      <tr>
-          <td>MI300A</td>
-          <td>BKC 26 (or later)<br>
-              BKC 25</td>
-          <td rowspan="3" style="vertical-align: middle;">Not Applicable</td>
-      </tr>
-      <tr>
-          <td>MI250X</td>
-          <td>IFWI 47 (or later)</td>
-      </tr>
-      <tr>
-          <td>MI250</td>
-          <td>MU5 w/ IFWI 75 (or later)</td>
-      </tr>
-      <tr>
-          <td>MI210</td>
-          <td>MU5 w/ IFWI 75 (or later)</td>
-          <td>8.4.0.K</td>
-      </tr>
-      <tr>
-          <td>MI100</td>
-          <td>VBIOS D3430401-037</td>
-          <td>Not Applicable</td>
-      </tr>
-  </table>
-</div>
-
-<p id="footnote1">[1]: PLDM bundle 01.25.05.00 will be available by October 31, 2025.</p>
-
-#### AMD Instinct MI300X GPU resiliency improvement
-
-Multimedia Engine Reset is now supported in AMD GPU Driver (amdgpu) 30.10.2 for AMD Instinct MI300X GPUs. This finer-grain GPU resiliency feature allows recovery from faults related to VCN or JPEG without requiring a full GPU reset, thereby improving system stability and fault tolerance. Note that VCN queue reset functionality requires PLDM bundle 01.25.05.00 (or later) firmware.
-
-#### New OS support in ROCm dependent on AMD GPU Driver
-
-ROCm support for RHEL 10.0 and Oracle 10 requires AMD GPU Driver 30.10.2 or later.
-
-### RAG AI support enabled for ROCm
-
-In September 2025, Retrieval-Augmented Generation (RAG) was added to the ROCm platform. Use RAG to build and deploy end-to-end AI pipelines on AMD GPUs. It enhances the accuracy and reliability of a large language model (LLM) by exposing it to up-to-date, relevant information. When queried, RAG retrieves relevant data from its knowledge base and uses it in conjunction with the query to generate accurate and informed responses. This approach minimizes hallucinations (the creation of false information) while also enabling the model to access current information not present in its original training data. For more information, see the [ROCm-RAG documentation](https://rocm.docs.amd.com/projects/rocm-rag/en/latest/index.html).
-
-### gsplat support enabled for ROCm
-
-[Gaussian splatting (gsplat)](https://rocm.docs.amd.com/projects/gsplat/en/latest/index.html) is an open-source library for GPU-accelerated differentiable rasterization of 3D Gaussians with Python bindings. This ROCm-enabled release of gsplat is built on top of [PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.3/install/3rd-party/pytorch-install.html), enabling innovators in computer graphics, machine learning, and 3D vision to leverage GPU acceleration with AMD Instinct GPUs. With gsplat, you can build, research, and innovate with Gaussian splatting. To install gsplat on ROCm, see [installation instructions](https://rocm.docs.amd.com/projects/gsplat/en/latest/install/gsplat-install.html).
-
-### Introducing ROCm Life Science (ROCm-LS) toolkit
-
-The ROCm Life Science (ROCm-LS) toolkit is an open-source software collection for high-performance life science and healthcare applications built on the core ROCm platform. It helps you accelerate life science processing and analyze workloads on AMD GPUs. ROCm-LS is in an early access state. Running production workloads is not recommended. For more information, see the [AMD ROCm-LS documentation](https://rocm.docs.amd.com/projects/rocm-ls/en/latest/).
-
-ROCm-LS provides the following tools to build a complete workflow for life science acceleration on AMD GPUs:
-
-* The hipCIM library provides powerful support for GPU-accelerated I/O operations, coupled with an array of computer vision and image processing primitives designed for N-dimensional image data in fields such as biomedical imaging. For more information, see the [hipCIM documentation](https://rocm.docs.amd.com/projects/hipCIM/en/latest/).
-
-* MONAI for AMD ROCm, a ROCm-enabled version of [MONAI](https://monai.io/), is built on top of [PyTorch for AMD ROCm](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/), helping healthcare and life science innovators to leverage GPU acceleration with AMD Instinct GPUs for high-performance inference and training of medical AI applications. For more information, see the [MONAI for AMD ROCm documentation](https://rocm.docs.amd.com/projects/monai/en/latest/).
-
-### Deep learning and AI framework updates
-
-ROCm provides a comprehensive ecosystem for deep learning development. For more information, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-7.0.2/how-to/deep-learning-rocm.html) and the [Compatibility
-matrix](../../docs/compatibility/compatibility-matrix.rst) for the complete list of Deep learning and AI framework versions tested for compatibility with ROCm.
-
-#### Updated framework support
-
-ROCm 7.0.0 introduces several newly supported versions of Deep learning and AI frameworks:
-
-##### PyTorch
-
-ROCm 7.0.2 enables support for PyTorch 2.8.
-
-#### New frameworks
-
-AMD ROCm has officially added support for the following Deep learning and AI frameworks:
-
-* FlashInfer is a library and kernel generator for Large Language Models (LLMs) that provides a high-performance implementation of graphics processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well as advanced performance across diverse scenarios. It is supported on ROCm 6.4.1. For more information, see [FlashInfer compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/flashinfer-compatibility.html).
-
-* llama.cpp is an open-source framework for Large Language Model (LLM) inference that runs on both central processing units (CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing a simple, dependency-free setup. It is now supported on ROCm 7.0.0 and 6.4.x. For more information, see [llama.cpp compatibility](https://rocm.docs.amd.com/en/docs-7.0.0/compatibility/ml-compatibility/llama-cpp-compatibility.html).
-
-### ROCm Offline Installer Creator updates
-
-The ROCm Offline Installer Creator 7.0.2 includes the following features and improvements:
-
-* Added support for RHEL 10.0, Oracle Linux 10, and Debian 13.
-* Added support for creating an offline installer for Debian 12 when the kernel version of the target operating system differs from the operating system of the host creating the installer.
-* Removed the restriction requiring the kernels for the host and target systems to match when creating a ROCm-only (no AMD GPU Driver) offline installer.
-
-See [ROCm Offline Installer Creator](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/install/rocm-offline-installer.html) for more information.
-
-### ROCm Runfile Installer updates
-
-The ROCm Runfile Installer 7.0.2 adds the following features and improvements:
-
-* Added support for RHEL 10.0, Oracle Linux 10, and Debian 13.
-* Minor fixes for the `untar` mode.
-For more information, see [ROCm Runfile Installer](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/install/rocm-runfile-installer.html).
+### ROCm SMI update
+* Fixed a failure to load GPU data, such as the System Clock (SCLK), by adjusting the logic for retrieving GPU board voltage.

 ### ROCm documentation updates

 ROCm documentation continues to be updated to provide clearer and more comprehensive guidance for a wider variety of user needs and use cases.

-* [Tutorials for AI developers](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/) have been expanded with the following two new inference tutorials:
-    * [Accelerating DeepSeek-V3 inference using multi-token prediction in SGLang](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/mtp.html)
-    * [Multi-agents with Google ADK and A2A protocol](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/power-Google-ADK-on-AMD-platform-and-local-LLMs.html)
+* [Tutorials for AI developers](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/) have been expanded with the following five new tutorials:
+    * Inference tutorials
+        * [ChatQnA vLLM deployment and performance evaluation](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/opea_deployment_and_evaluation.html)
+        * [Text-to-video generation with ComfyUI](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/t2v_comfyui_radeon.html)
+        * [DeepSeek Janus Pro on CPU or GPU](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/deepseek_janus_cpu_gpu.html)
+        * [DeepSeek-R1 with vLLM V1](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/vllm_v1_DSR1.html)
+    * GPU development and optimization tutorial: [MLA decoding kernel of AITER library](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/gpu_dev_optimize/aiter_mla_decode_kernel.html)
+ 
+    For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).

-    For more information about the changes, see the [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
+* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
+
+    * Taichi is an open-source, imperative, and parallel programming language designed for high-performance numerical computation. Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate compute-intensive Python code by compiling it to native GPU or CPU instructions. It is currently supported on ROCm 6.3.2. For more information, see [Taichi compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/taichi-compatibility.html).
+    * Megablocks is a lightweight library for mixture-of-experts (MoE) training. The core of the system consists of efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data- and pipeline-parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).
+
+* The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic now includes new hardware and library support information.
+
+## Operating system and hardware support changes
+
+Operating system and hardware support remain unchanged in this release.
+
+See the [Compatibility
+matrix](../../docs/compatibility/compatibility-matrix.rst)
+for more information about operating system and hardware compatibility.

 ## ROCm components

-The following table lists the versions of ROCm components for ROCm 7.0.2, including any version
-changes from 7.0.1 to 7.0.2. Click the component's updated version to go to a list of its changes.
-
+The following table lists the versions of ROCm components for ROCm 6.4.3.
 Click {fab}`github` to go to the component's source code on GitHub.

 <div class="pst-scrollable-table-container">
@@ -267,48 +96,48 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="9">Libraries</th>
                <th rowspan="9">Machine learning and computer vision</th>
-                <td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-7.0.2/index.html">Composable Kernel</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.4.3/index.html">Composable Kernel</a></td>
                <td>1.1.0</td>
                <td><a href="https://github.com/ROCm/composable_kernel"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-7.0.2/index.html">MIGraphX</a></td>
-                <td>2.13.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.4.3/index.html">MIGraphX</a></td>
+                <td>2.12.0</td>
                <td><a href="https://github.com/ROCm/AMDMIGraphX"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-7.0.2/index.html">MIOpen</a></td>
-                <td>3.5.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/miopen"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.4.3/index.html">MIOpen</a></td>
+                <td>3.4.0</td>
+                <td><a href="https://github.com/ROCm/MIOpen"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-7.0.2/index.html">MIVisionX</a></td>
-                <td>3.3.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.4.3/index.html">MIVisionX</a></td>
+                <td>3.2.0</td>
                <td><a href="https://github.com/ROCm/MIVisionX"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocAL/en/docs-7.0.2/index.html">rocAL</a></td>
-                <td>2.3.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocAL/en/docs-6.4.3/index.html">rocAL</a></td>
+                <td>2.2.0</td>
                <td><a href="https://github.com/ROCm/rocAL"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-7.0.2/index.html">rocDecode</a></td>
-                <td>1.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.4.3/index.html">rocDecode</a></td>
+                <td>0.10.0</td>
                <td><a href="https://github.com/ROCm/rocDecode"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocJPEG/en/docs-7.0.2/index.html">rocJPEG</a></td>
-                <td>1.1.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocJPEG/en/docs-6.4.3/index.html">rocJPEG</a></td>
+                <td>0.8.0</td>
                <td><a href="https://github.com/ROCm/rocJPEG"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocPyDecode/en/docs-7.0.2/index.html">rocPyDecode</a></td>
-                <td>0.6.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocPyDecode/en/docs-6.4.3/index.html">rocPyDecode</a></td>
+                <td>0.3.1</td>
                <td><a href="https://github.com/ROCm/rocPyDecode"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-7.0.2/index.html">RPP</a></td>
-                <td>2.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.4.3/index.html">RPP</a></td>
+                <td>1.9.10</td>
                <td><a href="https://github.com/ROCm/rpp"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
        </tbody>
@@ -316,13 +145,13 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="2"></th>
                <th rowspan="2">Communication</th>
-                <td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-7.0.2/index.html">RCCL</a></td>
-                <td>2.26.6&nbsp;&Rightarrow;&nbsp;<a href="#rccl-2-26-6">2.26.6</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.4.3/index.html">RCCL</a></td>
+                <td>2.22.3</td>
                <td><a href="https://github.com/ROCm/rccl"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-            <td><a href="https://rocm.docs.amd.com/projects/rocSHMEM/en/docs-7.0.2/index.html">rocSHMEM</a></td>
-                <td>3.0.0</td>
+            <td><a href="https://rocm.docs.amd.com/projects/rocSHMEM/en/docs-6.4.3/index.html">rocSHMEM</a></td>
+                <td>2.0.1</td>
                <td><a href="https://github.com/ROCm/rocSHMEM"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
        </tbody>
@@ -330,136 +159,136 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="16"></th>
                <th rowspan="16">Math</th>
-                <td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-7.0.2/index.html">hipBLAS</a></td>
-                <td>3.0.0&nbsp;&Rightarrow;&nbsp;<a href="#hipblas-3-0-2">3.0.2</a></td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblas"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.4.3/index.html">hipBLAS</a></td>
+                <td>2.4.0</td>
+                <td><a href="https://github.com/ROCm/hipBLAS"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-7.0.2/index.html">hipBLASLt</a></td>
-                <td>1.0.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.4.3/index.html">hipBLASLt</a></td>
+                <td>0.12.1</td>
+                <td><a href="https://github.com/ROCm/hipBLASLt"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-7.0.2/index.html">hipFFT</a></td>
-                <td>1.0.20</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.4.3/index.html">hipFFT</a></td>
+                <td>1.0.18</td>
+                <td><a href="https://github.com/ROCm/hipFFT"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-7.0.2/index.html">hipfort</a></td>
-                <td>0.7.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.4.3/index.html">hipfort</a></td>
+                <td>0.6.0</td>
                <td><a href="https://github.com/ROCm/hipfort"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-7.0.2/index.html">hipRAND</a></td>
-                <td>3.0.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.4.3/index.html">hipRAND</a></td>
+                <td>2.12.0</td>
+                <td><a href="https://github.com/ROCm/hipRAND"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-7.0.2/index.html">hipSOLVER</a></td>
-                <td>3.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.4.3/index.html">hipSOLVER</a></td>
+                <td>2.4.0</td>
                <td><a href="https://github.com/ROCm/hipSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-7.0.2/index.html">hipSPARSE</a></td>
-                <td>4.0.1</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparse"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.4.3/index.html">hipSPARSE</a></td>
+                <td>3.2.0</td>
+                <td><a href="https://github.com/ROCm/hipSPARSE"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-7.0.2/index.html">hipSPARSELt</a></td>
-                <td>0.2.4</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparselt"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.4.3/index.html">hipSPARSELt</a></td>
+                <td>0.2.3</td>
+                <td><a href="https://github.com/ROCm/hipSPARSELt"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-7.0.2/index.html">rocALUTION</a></td>
-                <td>4.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.4.3/index.html">rocALUTION</a></td>
+                <td>3.2.3</td>
                <td><a href="https://github.com/ROCm/rocALUTION"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-7.0.2/index.html">rocBLAS</a></td>
-                <td>5.0.0&nbsp;&Rightarrow;&nbsp;<a href="#rocblas-5-0-2">5.0.2</a></td></td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocblas"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.4.3/index.html">rocBLAS</a></td>
+                <td>4.4.1</td>
+                <td><a href="https://github.com/ROCm/rocBLAS"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-7.0.2/index.html">rocFFT</a></td>
-                <td>1.0.34</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.4.3/index.html">rocFFT</a></td>
+                <td>1.0.32</td>
+                <td><a href="https://github.com/ROCm/rocFFT"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-7.0.2/index.html">rocRAND</a></td>
-                <td>4.0.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocrand"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.4.3/index.html">rocRAND</a></td>
+                <td>3.3.0</td>
+                <td><a href="https://github.com/ROCm/rocRAND"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-7.0.2/index.html">rocSOLVER</a></td>
-                <td>3.30.0&nbsp;&Rightarrow;&nbsp;<a href="#rocsolver-3-30-1">3.30.1</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.4.3/index.html">rocSOLVER</a></td>
+                <td>3.28.2</td>
                <td><a href="https://github.com/ROCm/rocSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-7.0.2/index.html">rocSPARSE</a></td>
-                <td>4.0.2&nbsp;&Rightarrow;&nbsp;<a href="#rocsparse-4-0-3">4.0.3</a></td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsparse"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.4.3/index.html">rocSPARSE</a></td>
+                <td>3.4.0</td>
+                <td><a href="https://github.com/ROCm/rocSPARSE"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-7.0.2/index.html">rocWMMA</a></td>
-                <td>2.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.4.3/index.html">rocWMMA</a></td>
+                <td>1.7.0</td>
                <td><a href="https://github.com/ROCm/rocWMMA"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/Tensile/en/docs-7.0.2/src/index.html">Tensile</a></td>
-                <td>4.44.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/shared/tensile"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/Tensile/en/docs-6.4.3/src/index.html">Tensile</a></td>
+                <td>4.43.0</td>
+                <td><a href="https://github.com/ROCm/Tensile"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
        </tbody>
        <tbody class="rocm-components-libs rocm-components-primitives tbody-reverse-zebra">
            <tr>
                <th rowspan="4"></th>
                <th rowspan="4">Primitives</th>
-                <td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-7.0.2/index.html">hipCUB</a></td>
-                <td>4.0.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.4.3/index.html">hipCUB</a></td>
+                <td>3.4.0</td>
+                <td><a href="https://github.com/ROCm/hipCUB"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-7.0.2/index.html">hipTensor</a></td>
-                <td>2.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.4.3/index.html">hipTensor</a></td>
+                <td>1.5.0</td>
                <td><a href="https://github.com/ROCm/hipTensor"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-7.0.2/index.html">rocPRIM</a></td>
-                <td>4.0.0&nbsp;&Rightarrow;&nbsp;<a href="#rocprim-4-0-1">4.0.1</a></td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocprim"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.4.3/index.html">rocPRIM</a></td>
+                <td>3.4.1</td>
+                <td><a href="https://github.com/ROCm/rocPRIM"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-7.0.2/index.html">rocThrust</a></td>
-                <td>4.0.0</td>
-                <td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust"><i class="fab fa-github fa-lg"></i></a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.4.3/index.html">rocThrust</a></td>
+                <td>3.3.0</td>
+                <td><a href="https://github.com/ROCm/rocThrust"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
        </tbody>
        <tbody class="rocm-components-tools rocm-components-system tbody-reverse-zebra">
            <tr>
                <th rowspan="7">Tools</th>
                <th rowspan="7">System management</th>
-                <td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-7.0.2/index.html">AMD SMI</a></td>
-                <td>26.0.0&nbsp;&Rightarrow;&nbsp;<a href="#amd-smi-26-0-1">26.0.1</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.4.3/index.html">AMD SMI</a></td>
+                <td>25.5.1</td>
                <td><a href="https://github.com/ROCm/amdsmi"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-7.0.2/index.html">ROCm Data Center Tool</a></td>
-                <td>1.1.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.4.3/index.html">ROCm Data Center Tool</a></td>
+                <td>0.3.0</td>
                <td><a href="https://github.com/ROCm/rdc"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-7.0.2/index.html">rocminfo</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.4.3/index.html">rocminfo</a></td>
                <td>1.0.0</td>
                <td><a href="https://github.com/ROCm/rocminfo"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-7.0.2/index.html">ROCm SMI</a></td>
-                <td>7.8.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.4.3/index.html">ROCm SMI</a></td>
+                <td>7.5.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-smi-7-7-0">7.7.0</a></td>
                <td><a href="https://github.com/ROCm/rocm_smi_lib"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-7.0.2/index.html">ROCm Validation Suite</a></td>
-                <td>1.2.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.4.3/index.html">ROCm Validation Suite</a></td>
+                <td>1.1.0</td>
                <td><a href="https://github.com/ROCm/ROCmValidationSuite"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
        </tbody>
@@ -467,38 +296,38 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="6"></th>
                <th rowspan="6">Performance</th>
-                <td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-7.0.2/index.html">ROCm Bandwidth
+                <td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.4.3/index.html">ROCm Bandwidth
                        Test</a></td>
-                <td>2.6.0</td>
+                <td>1.4.0</td>
                <td><a href="https://github.com/ROCm/rocm_bandwidth_test/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-7.0.2/index.html">ROCm Compute Profiler</a></td>
-                <td>3.2.3</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.3/index.html">ROCm Compute Profiler</a></td>
+                <td>3.1.1</td>
                <td><a href="https://github.com/ROCm/rocprofiler-compute"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-systems/en/docs-7.0.2/index.html">ROCm Systems Profiler</a></td>
-                <td>1.1.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-systems-profiler-1-1-1">1.1.1</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-systems/en/docs-6.4.3/index.html">ROCm Systems Profiler</a></td>
+                <td>1.0.2</td>
                <td><a href="https://github.com/ROCm/rocprofiler-systems"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-7.0.2/index.html">ROCProfiler</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.3/index.html">ROCProfiler</a></td>
                <td>2.0.0</td>
                <td><a href="https://github.com/ROCm/ROCProfiler/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-7.0.2/index.html">ROCprofiler-SDK</a></td>
-                <td>1.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-6.4.3/index.html">ROCprofiler-SDK</a></td>
+                <td>0.6.0</td>
                <td><a href="https://github.com/ROCm/rocprofiler-sdk/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr >
-                <td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-7.0.2/index.html">ROCTracer</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-6.4.3/index.html">ROCTracer</a></td>
                <td>4.1.0</td>
                <td><a href="https://github.com/ROCm/ROCTracer/"><i
                            class="fab fa-github fa-lg"></i></a></td>
@@ -508,34 +337,34 @@ Click {fab}`github` to go to the component's source code on GitHub.
            <tr>
                <th rowspan="5"></th>
                <th rowspan="5">Development</th>
-                <td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-7.0.2/index.html">HIPIFY</a></td>
-                <td>20.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.4.3/index.html">HIPIFY</a></td>
+                <td>19.0.0</td>
                <td><a href="https://github.com/ROCm/HIPIFY/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-7.0.2/index.html">ROCdbgapi</a></td>
-                <td>0.77.3&nbsp;&Rightarrow;&nbsp;<a href="#rocdbgapi-0-77-4">0.77.4</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.4.3/index.html">ROCdbgapi</a></td>
+                <td>0.77.2</td>
                <td><a href="https://github.com/ROCm/ROCdbgapi/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-7.0.2/index.html">ROCm CMake</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.4.3/index.html">ROCm CMake</a></td>
                <td>0.14.0</td>
                <td><a href="https://github.com/ROCm/rocm-cmake/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-7.0.2/index.html">ROCm Debugger (ROCgdb)</a>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.4.3/index.html">ROCm Debugger (ROCgdb)</a>
                </td>
-                <td>16.3</td>
+                <td>15.2</td>
                <td><a href="https://github.com/ROCm/ROCgdb/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-7.0.2/index.html">ROCr Debug Agent</a>
+                <td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.4.3/index.html">ROCr Debug Agent</a>
                </td>
-                <td>2.1.0</td>
+                <td>2.0.4</td>
                <td><a href="https://github.com/ROCm/rocr_debug_agent/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -543,14 +372,14 @@ Click {fab}`github` to go to the component's source code on GitHub.
        <tbody class="rocm-components-compilers tbody-reverse-zebra">
            <tr>
                <th rowspan="2" colspan="2">Compilers</th>
-                <td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-7.0.2/index.html">HIPCC</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.4.3/index.html">HIPCC</a></td>
                <td>1.1.1</td>
                <td><a href="https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/llvm-project/en/docs-7.0.2/index.html">llvm-project</a></td>
-                <td>20.0.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.3/index.html">llvm-project</a></td>
+                <td>19.0.0</td>
                <td><a href="https://github.com/ROCm/llvm-project/"><i
                            class="fab fa-github fa-lg"></i></a></td>
            </tr>
@@ -558,13 +387,13 @@ Click {fab}`github` to go to the component's source code on GitHub.
        <tbody class="rocm-components-runtimes tbody-reverse-zebra">
            <tr>
                <th rowspan="2" colspan="2">Runtimes</th>
-                <td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-7.0.2/index.html">HIP</a></td>
-                <td>7.0.0&nbsp;&Rightarrow;&nbsp;<a href="#hip-7-0-2">7.0.2</a></td>
+                <td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.3/index.html">HIP</a></td>
+                <td>6.4.3</td>
                <td><a href="https://github.com/ROCm/HIP/"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
            <tr>
-                <td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-7.0.2/index.html">ROCr Runtime</a></td>
-                <td>1.18.0</td>
+                <td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.4.3/index.html">ROCr Runtime</a></td>
+                <td>1.15.0</td>
                <td><a href="https://github.com/ROCm/ROCR-Runtime/"><i class="fab fa-github fa-lg"></i></a></td>
            </tr>
        </tbody>
@@ -579,146 +408,28 @@ The following sections describe key changes to ROCm components.
 For a historical overview of ROCm component updates, see the {doc}`ROCm consolidated changelog </release/changelog>`.
 ```

-### **AMD SMI** (26.0.1)
+### **ROCm SMI** (7.7.0)

 #### Added

-* Added `bad_page_threshold_exceeded` field to `amd-smi static --ras`, which compares retired pages count against bad page threshold. This field displays `True` if retired pages exceed the threshold, `False` if within threshold, or `N/A` if threshold data is unavailable. Note that `sudo` is required to have the `bad_page_threshold_exceeded` field populated.
+- Support for getting the GPU Board voltage.

-#### Removed
-
-* Removed gpuboard and baseboard temperatures enums in amdsmi Python Library.
-    * `AmdSmiTemperatureType` had issues with referencing the correct attribute. As such, the following duplicate enums have been removed:
-        - `AmdSmiTemperatureType.GPUBOARD_NODE_FIRST`
-        - `AmdSmiTemperatureType.GPUBOARD_VR_FIRST`
-        - `AmdSmiTemperatureType.BASEBOARD_FIRST`
-
-#### Resolved Issues
-
-* Fixed `attribute error` in `amd-smi monitor` on Linux Guest systems, where the violations argument caused CLI to break.
-* Fixed certain output in `amd-smi monitor` when GPUs are partitioned.  
-  * It fixes the amd-smi monitor such as: `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, `amd-smi monitor -Vqt --file /tmp/test1`, etc. These commands will now be able to display as normal in partitioned GPU scenarios.
-
-* Fixed an issue where using `amd-smi ras --folder <folder_name>` was forcing the created folder's name to be lowercase. This fix also allows all string input options to be case insensitive.
-
-* Fixed an issue of some processes not being detected by AMD SMI despite making use of KFD resources. This fix, with the addition of KFD Fallback for process detection, ensures that all KFD processes will be detected.
-
-* Multiple CPER issues were fixed.  
-  - Issue of being unable to query for additional CPERs after 20 were generated on a single device.
-  - Issue where the RAS HBM CRC read was failing due to an incorrect AFID value.
-  - Issue where RAS injections were not consistently producing related CPERs.
-
-### **HIP** (7.0.2)
-
-#### Added
-
-* Support for the `hipMemAllocationTypeUncached` flag, enabling developers to allocate uncached memory. This flag is now supported in the following APIs:
-    - `hipMemGetAllocationGranularity` determines the recommended allocation granularity for uncached memory.
-    - `hipMemCreate` allocates memory with uncached properties.
-
-#### Resolved issues
-
-* A compilation failure affecting applications that compile kernels using `hiprtc` with the compiler option `std=c++11`.
-* A permission-related error occurred during the execution of `hipLaunchHostFunc`. This API is now supported and permitted to run during stream capture, aligning its behavior with CUDA.
-* A numerical error during graph capture of kernels that rely on a remainder in `globalWorkSize`, in frameworks like MIOpen and PyTorch, where the grid size is not a multiple of the block size. To ensure correct replay behavior, HIP runtime now stores this remainder in `hip::GraphKernelNode` during `hipExtModuleLaunchKernel` capture, enabling accurate execution and preventing corruption.
-* A page fault occurred during viewport rendering while running the file undo.blend in Blender. The issue was resolved by the HIP runtime, which reused the same context during image creation.
-* Resolved a segmentation fault in `gpu_metrics`, which is used in threshold logic for command submission patches to GPU device(s) during CPU synchronization.
-
-### **hipBLAS** (3.0.2)
- 
-#### Added
- 
-* Enabled support for gfx1150, gfx1151, gfx1200, and gfx1201 AMD hardware.
-
-### **RCCL** (2.26.6)
-
-#### Added
-
-* Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap bf16 arithmetic.
-* Added `--force-reduce-pipeline` as an option that can be passed to the `install.sh` script. Passing this option will enable software-triggered pipelining `bfloat16` reductions (that is, `all_reduce`, `reduce_scatter`, and `reduce`).
-
-### **rocBLAS** (5.0.2)
- 
-#### Added
- 
-* Enabled gfx1150 and gfx1151.
-* The `ROCBLAS_USE_HIPBLASLT_BATCHED` variable to independently control the batched hipblaslt backend. Set `ROCBLAS_USE_HIPBLASLT_BATCHED=0` to disable batched GEMM use of the hipblaslt backend.
-
-#### Resolved issues
- 
-* Set the imaginary portion of the main diagonal of the output matrix to zero in syrk and herk.
-
-### **ROCdbgapi** (0.77.4)
-
-#### Added
-
-* ROCdbgapi documentation link in the README.md file.
-
-### **ROCm Systems Profiler** (1.1.1)
-
-#### Resolved issues
-
-* Fixed an issue where ROC-TX ranges were displayed as two separate events instead of a single spanning event.
-
-### **rocPRIM** (4.0.1)
-
-#### Resolved issues
-
-* Fixed compilation issue when using `rocprim::texture_cache_iterator`.
-* Fixed a HIP version check used to determine whether `hipStreamLegacy` is supported. This resolves runtime errors that occur when `hipStreamLegacy` is used in ROCm 7.0.0 and later.
-
-### **rocSPARSE** (4.0.3)
-
-#### Resolved issues
-
-* Fixed an issue causing premature deallocation of internal buffers while still in use.
-
-### **rocSOLVER** (3.30.1)
-
-#### Optimized
-
-Improved the performance of:
-
-* LARFT and downstream functions such as GEQRF and ORMTR.
-* LARF and downstream functions such as GEQR2.
-* ORMTR and downstream functions such as SYEVD.
-* GEQR2 and downstream functions such as GEQRF.
+```{note}
+See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
+```

 ## ROCm known issues

 ROCm known issues are noted on {fab}`github` [GitHub](https://github.com/ROCm/ROCm/labels/Verified%20Issue). For known
 issues related to individual components, review the [Detailed component changes](#detailed-component-changes).

-### ROCm debugging tools might become unresponsive in SELinux-enabled distributions
-
-Red Hat Enterprise Linux (RHEL) and related distributions automatically enable a security feature named Security-Enhanced Linux (SELinux), which may prevent ROCm debugging tools, such as ROCgdb, ROCdbgapi, and ROCR Debug Agent, from working correctly.
- 
-The problem occurs when attempting to debug a program that contains code that runs on the GPU. The debugging session might become unresponsive while attempting to reach a breakpoint or executing instruction-stepping in device code. ROCgdb will still be responsive and accept interruptions by pressing `Control+C`, but the breakpoint in device code won't be hit, and the instruction-stepping operation will not be completed.
- 
-The ROCR Debug Agent might also become unresponsive when attempting to capture data from a program that is experiencing queue errors, memory faults, or other triggering events.
- 
-For a detailed workaround, see the [Installation troubleshooting](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/install-faq.html#issue-10-rocm-debugging-tools-might-become-unresponsive-in-selinux-enabled-distributions) documentation. This issue will be fixed in a future ROCm release.
-
-### MIGraphX Python API will fail when running on Python 3.13
-
-Applications using the MIGraphX Python API will fail when running on Python 3.13 and return the error message `AttributeError: module 'migraphx' has no attribute 'parse_onnx'`. The issue does not occur when you manually build MIGraphX. For detailed instructions, see [Building from source](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/install/building_migraphx.html). As a workaround, change the Python version to the one found in the installed location:
-
-```
-ls -l /opt/rocm-7.0.0/lib/libmigraphx_py_*.so
-```
-The issue will be resolved in a future ROCm release.
-
-### Applications using OpenCV might fail due to package incompatibility between the OS
-
-OpenCV packages built on Ubuntu 24.04 are incompatible with Debian 13 due to a version conflict. As a result, applications, tests, and samples that use OpenCV might fail. To avoid the version conflict, rebuild OpenCV with the version corresponding to Debian 13, then rebuild MIVisionX on top of it. As a workaround, rebuild OpenCV from source, followed by the application that uses OpenCV. This issue will be fixed in a future ROCm release.
-
 ## ROCm upcoming changes

 The following changes to the ROCm software stack are anticipated for future releases.

-### ROCm Execution Provider (ROCm-EP) deprecation
+### AMD SMI migration to AMDGPU driver repository

-ROCm 7.0.2 is the last official AMD-supported distribution of ROCm Execution Provider (ROCm-EP). ROCm EP will be removed from all upcoming ROCm releases. Refer to this [Pull Request](https://github.com/microsoft/onnxruntime/pull/25181) for more information. Migrate your applications to use the [MIGraphX Execution Provider](https://onnxruntime.ai/docs/execution-providers/MIGraphX-ExecutionProvider.html#migraphx-execution-provider).
+In a future release, [AMD SMI](https://github.com/ROCm/amdsmi) will be relocated from the ROCm organization repository to a new AMDTools repository to better align with its system-level functionality. `amd-smi-lib` will no longer be included in the `rocm-developer-tools` meta-package included with your standard ROCm installation. Instead, it will be packaged with the AMDGPU driver installation.

 ### ROCm SMI deprecation

@@ -742,14 +453,15 @@ It's anticipated that ROCTracer, ROCProfiler, `rocprof`, and `rocprofv2` will re
 ### AMDGPU wavefront size compiler macro deprecation

 Access to the wavefront size as a compile-time constant via the `__AMDGCN_WAVEFRONT_SIZE`
-and `__AMDGCN_WAVEFRONT_SIZE__` macros are deprecated and will be disabled in a future release. In ROCm 7.0.0 `warpSize` is only available as a non-`constexpr` variable. You're encouraged to update your code if needed to ensure future compatibility.
+and `__AMDGCN_WAVEFRONT_SIZE__` macros or the `constexpr warpSize` variable is deprecated
+and will be disabled in a future release. 

 * The `__AMDGCN_WAVEFRONT_SIZE__` macro and `__AMDGCN_WAVEFRONT_SIZE` alias will be removed in an upcoming release.
  It is recommended to remove any use of this macro. For more information, see
-  [AMDGPU support](https://rocm.docs.amd.com/projects/llvm-project/en/docs-7.0.2/LLVM/clang/html/AMDGPUSupport.html).
-* `warpSize` is only available as a non-`constexpr` variable. Where required,
+  [AMDGPU support](https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.3/LLVM/clang/html/AMDGPUSupport.html).
+* `warpSize` will only be available as a non-`constexpr` variable. Where required,
  the wavefront size should be queried via the `warpSize` variable in device code,
-  or via `hipGetDeviceProperties` in host code. Neither of these will result in a compile-time constant. For more information, see [warpSize](https://rocm.docs.amd.com/projects/HIP/en/docs-7.0.2/how-to/hip_cpp_language_extensions.html#warpsize).
+  or via `hipGetDeviceProperties` in host code. Neither of these will result in a compile-time constant. For more information, see [warpSize](https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.3/how-to/hip_cpp_language_extensions.html#warpsize).
 * For cases where compile-time evaluation of the wavefront size cannot be avoided,
  uses of `__AMDGCN_WAVEFRONT_SIZE`, `__AMDGCN_WAVEFRONT_SIZE__`, or `warpSize`
  can be replaced with a user-defined macro or `constexpr` variable with the wavefront
@@ -763,9 +475,13 @@ and `__AMDGCN_WAVEFRONT_SIZE__` macros are deprecated and will be disabled in a
   #endif
 ```

+### HIPCC Perl scripts deprecation
+
+The HIPCC Perl scripts (`hipcc.pl` and `hipconfig.pl`) will be removed in an upcoming release.
+
 ### Changes to ROCm Object Tooling

-ROCm Object Tooling tools ``roc-obj-ls``, ``roc-obj-extract``, and ``roc-obj`` were
+ROCm Object Tooling tools ``roc-obj-ls``, ``roc-obj-extract``, and ``roc-obj`` are
 deprecated in ROCm 6.4, and will be removed in a future release. Functionality
 has been added to the ``llvm-objdump --offloading`` tool option to extract all
 clang-offload-bundles into individual code objects found within the objects
@@ -773,3 +489,11 @@ or executables passed as input.  The ``llvm-objdump --offloading`` tool option a
 supports the ``--arch-name`` option, and only extracts code objects found with
 the specified target architecture. See [llvm-objdump](https://llvm.org/docs/CommandGuide/llvm-objdump.html)
 for more information. 
+
+### HIP runtime API changes
+ 
+There are a number of changes planned for the HIP runtime API in an upcoming major release
+that are not backward compatible with prior releases. Most of these changes increase
+alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to
+clean up header files, remove namespace collisions, and have a clear separation between
+`hipRTC` and HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0-blog/README.html).
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-7.0.2"
+    <default revision="refs/tags/rocm-6.4.3"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
@@ -9,7 +9,6 @@
    <project name="ROCK-Kernel-Driver" />
    <project name="ROCR-Runtime" />
    <project name="amdsmi" />
-    <project name="aqlprofile" />
    <project name="rdc" />
    <project name="rocm_bandwidth_test" />
    <project name="rocm_smi_lib" />
@@ -23,7 +22,7 @@
    <project name="rocprofiler-systems" />
    <project name="roctracer" />
 <!--HIP Projects-->
-    <project name="hip" />
+    <project name="HIP" />
    <project name="hip-tests" />
    <project name="HIPIFY" />
    <project name="clr" />
@@ -38,24 +37,36 @@
    <project name="rocr_debug_agent" />
 <!-- ROCm Libraries -->
    <project groups="mathlibs" name="AMDMIGraphX" />
+    <project groups="mathlibs" name="MIOpen" />
    <project groups="mathlibs" name="MIVisionX" />
    <project groups="mathlibs" name="ROCmValidationSuite" />
+    <project groups="mathlibs" name="Tensile" />
    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipBLAS-common" />
+    <project groups="mathlibs" name="hipBLAS" />
+    <project groups="mathlibs" name="hipBLASLt" />
+    <project groups="mathlibs" name="hipCUB" />
+    <project groups="mathlibs" name="hipFFT" />
+    <project groups="mathlibs" name="hipRAND" />
+    <project groups="mathlibs" name="hipSOLVER" />
+    <project groups="mathlibs" name="hipSPARSE" />
+    <project groups="mathlibs" name="hipSPARSELt" />
    <project groups="mathlibs" name="hipTensor" />
    <project groups="mathlibs" name="hipfort" />
    <project groups="mathlibs" name="rccl" />
    <project groups="mathlibs" name="rocAL" />
    <project groups="mathlibs" name="rocALUTION" />
+    <project groups="mathlibs" name="rocBLAS" />
    <project groups="mathlibs" name="rocDecode" />
    <project groups="mathlibs" name="rocJPEG" />
-    <!-- The following components have been migrated to rocm-libraries:
-        hipBLAS-common hipBLAS hipBLASLt hipCUB
-        hipFFT hipRAND hipSPARSE hipSPARSELt
-        MIOpen rocBLAS rocFFT rocPRIM rocRAND
-        rocSPARSE rocThrust Tensile -->
-    <project groups="mathlibs" name="rocm-libraries" />
    <project groups="mathlibs" name="rocPyDecode" />
+    <project groups="mathlibs" name="rocFFT" />
+    <project groups="mathlibs" name="rocPRIM" />
+    <project groups="mathlibs" name="rocRAND" />
    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
+    <project groups="mathlibs" name="rocSPARSE" />
+    <project groups="mathlibs" name="rocThrust" />
    <project groups="mathlibs" name="rocWMMA" />
    <project groups="mathlibs" name="rocm-cmake" />
    <project groups="mathlibs" name="rpp" />
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -29,7 +29,7 @@ additional licenses. Please review individual repositories for more information.
 | [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
-| [AQLprofile](https://github.com/rocm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
+| AQLprofile | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE) |
 | [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
 | [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
 | [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
@@ -50,7 +50,7 @@ additional licenses. Please review individual repositories for more information.
 | [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
 | [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
-| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
+| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
 | [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
 | [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
 | [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
@@ -67,15 +67,15 @@ additional licenses. Please review individual repositories for more information.
 | [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
 | [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
 | [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
-| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
+| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE) |
 | [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
 | [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
 | [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
-| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
-| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
+| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/License.txt) |
+| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE) |
 | [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
 | [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
-| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
+| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE) |
 | [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
 | [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
 | [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -1,137 +1,131 @@
-ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
-      ,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 10.0 [#rhel-10-702-past-60]_, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
-      ,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
-      ,SLES 15 SP7 [#sles-db-700-past-60]_,SLES 15 SP7 [#sles-db-700-past-60]_,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
-      ,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
-      ,"Oracle Linux 10, 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
-      ,"Debian 13 [#db-mi300x-past-60]_, 12 [#sles-db-700-past-60]_",Debian 12 [#sles-db-700-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
-      ,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
-      ,Rocky Linux 9 [#rl-700-past-60]_,Rocky Linux 9 [#rl-700-past-60]_,,,,,,,,,,,,,,,,,,
-      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
-      ,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
-      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
-      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
-      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
-      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
-      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os-past-60]_,gfx950 [#mi350x-os-past-60]_,,,,,,,,,,,,,,,,,,
-      ,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
-      ,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
-      ,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
-      ,gfx90a [#mi200x-os-past-60]_,gfx90a [#mi200x-os-past-60]_,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
-      ,gfx908 [#mi100-os-past-60]_,gfx908 [#mi100-os-past-60]_,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
-      ,,,,,,,,,,,,,,,,,,,,
-      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
-      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,b6356,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
-      ,,,,,,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,,,,,,
-      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
-      ,,,,,,,,,,,,,,,,,,,,
-      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      Thrust,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      CUB,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      ,,,,,,,,,,,,,,,,,,,,
-     DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
-      ,,,,,,,,,,,,,,,,,,,,
-      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
-      :doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
-      :doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
-      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
-      ,,,,,,,,,,,,,,,,,,,,
-      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      ,,,,,,,,,,,,,,,,,,,,
-      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
-      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
-      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
-      :doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
-      :doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
-      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
-      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
-      :doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
-      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
-      :doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
-      :doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
-      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
-      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
-      ,,,,,,,,,,,,,,,,,,,,
-      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
-      :doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      ,,,,,,,,,,,,,,,,,,,,
-      SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,
-      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
-      ,,,,,,,,,,,,,,,,,,,,
-      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
-      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
-      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
-      ,,,,,,,,,,,,,,,,,,,,
-      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
-      ,,,,,,,,,,,,,,,,,,,,
-      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,
-      :doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
-      ,,,,,,,,,,,,,,,,,,,,
-      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
-      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25381,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`llvm-project <llvm-project:index>`,20.0.0.25381,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25381,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      ,,,,,,,,,,,,,,,,,,,,
-      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      :doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
+ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
+      ,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
+      ,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
+      ,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+      ,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
+      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
+      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
+      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
+      ,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
+      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
+      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
+      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
+      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
+      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
+      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
+,,,,,,,,,,,,,,,,,,
+      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.20,1.20,1.20,1.20,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
+,,,,,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,,,
+      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
+      ,,,,,,,,,,,,,,,,,,
+      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      Thrust,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      CUB,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+,,,,,,,,,,,,,,,,,,
+      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      ,,,,,,,,,,,,,,,,,,
+      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
+      ,,,,,,,,,,,,,,,,,,
+      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      ,,,,,,,,,,,,,,,,,,
+      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
+      :doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
+      ,,,,,,,,,,,,,,,,,,
+      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      ,,,,,,,,,,,,,,,,,,
+      SUPPORT LIBS,,,,,,,,,,,,,,,,,,
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
+      ,,,,,,,,,,,,,,,,,,
+      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`AMD SMI <amdsmi:index>`,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      ,,,,,,,,,,,,,,,,,,
+      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCTracer <roctracer:index>`,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
+      ,,,,,,,,,,,,,,,,,,
+      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
+      ,,,,,,,,,,,,,,,,,,
+      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
+      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+,,,,,,,,,,,,,,,,,,
+      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`HIP <hip:index>`,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -11,9 +11,9 @@ Use this matrix to view the ROCm compatibility and system requirements across su
 You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.

 Accelerators and GPUs listed in the following table support compute workloads (no display
-information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
-workloads, see the `Use ROCm on Radeon and Ryzen
-<https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/index.html>`_ to verify
+information or graphics). If you’re using ROCm with AMD Radeon or Radeon PRO GPUs for graphics
+workloads, see the `Use ROCm on Radeon GPU documentation
+<https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ to verify
 compatibility and system requirements.

 .. |br| raw:: html
@@ -23,162 +23,142 @@ compatibility and system requirements.
 .. container:: format-big-table

  .. csv-table::
-      :header: "ROCm Version", "7.0.2", "7.0.1/7.0.0", "6.4.0"
+      :header: "ROCm Version", "6.4.3", "6.4.2", "6.3.0"
      :stub-columns: 1

-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
-      ,"RHEL 10.0 [#rhel-10-702]_, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.5, 9.4"
-      ,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10
-      ,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP6
-      ,"Oracle Linux 10, 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_"
-      ,"Debian 13 [#db-mi300x]_, 12 [#sles-db-700]_",Debian 12 [#sles-db-700]_,Debian 12 [#single-node]_
-      ,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_
-      ,Rocky Linux 9 [#rl-700]_,Rocky Linux 9 [#rl-700]_,
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.5, 9.4"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10
+      ,"SLES 15 SP7, SP6","SLES 15 SP7, SP6","SLES 15 SP6, SP5"
+      ,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
+      ,Debian 12 [#single-node]_,Debian 12 [#single-node]_,
+      ,Azure Linux 3.0 [#mi300x]_,Azure Linux 3.0 [#mi300x]_,
      ,.. _architecture-support-compatibility-matrix:,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
-      ,CDNA3,CDNA3,CDNA3
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
      ,RDNA4,RDNA4,
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os]_,gfx950 [#mi350x-os]_,
-      ,gfx1201 [#RDNA-OS-700]_,gfx1201 [#RDNA-OS-700]_,
-      ,gfx1200 [#RDNA-OS-700]_,gfx1200 [#RDNA-OS-700]_,
-      ,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,
-      ,gfx1100 [#RDNA-OS-700]_,gfx1100 [#RDNA-OS-700]_,gfx1100
-      ,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030
-      ,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942
-      ,gfx90a [#mi200x-os]_,gfx90a [#mi200x-os]_,gfx90a
-      ,gfx908 [#mi100-os]_,gfx908 [#mi100-os]_,gfx908
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,gfx1201 [#RDNA-OS]_,
+      ,gfx1200 [#RDNA-OS]_,gfx1200 [#RDNA-OS]_,
+      ,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,
+      ,gfx1100,gfx1100,gfx1100
+      ,gfx1030,gfx1030,gfx1030
+      ,gfx942,gfx942,gfx942
+      ,gfx90a,gfx90a,gfx90a
+      ,gfx908,gfx908,gfx908
      ,,,
      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.18.1, 2.17.1, 2.16.2"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
-      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,b6356,b5997
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,0.7.0
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.20,1.20,1.17.3
      ,,,
      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0
      ,,,
      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
-      Thrust,2.6.0,2.6.0,2.5.0
-      CUB,2.6.0,2.6.0,2.5.0
+      Thrust,2.5.0,2.5.0,2.3.2
+      CUB,2.5.0,2.5.0,2.3.2
      ,,,
-      DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      :doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
+      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0
-      :doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0
-      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0
-      :doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0
-      :doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0
-      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0
-      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1
-      :doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
-      :doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3
-      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.1,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0
-      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.0
-      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18
-      :doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0
-      :doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0
-      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0
-      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3
-      :doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.2
-      :doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.0
-      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32
-      :doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.0
-      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
-      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.10.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.0
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.1
+      :doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.1,4.3.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.2,3.27.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
-      :doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0
-      :doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
-      :doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.0
-      :doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.1,3.3.0
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0
      ,,,
      SUPPORT LIBS,,,
-      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43482
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.0
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.3.42131
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.3,6.4.2,6.3.0
      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.3.0
-      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0
+      :doc:`AMD SMI <amdsmi:index>`,25.5.1,25.5.1,24.7.1
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.7.0,7.5.0,7.4.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.0
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60400
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
-      :doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60400
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.1,3.0.0
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.2,0.1.0
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60403,2.0.60402,2.0.60300
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0
+      :doc:`ROCTracer <roctracer:index>`,4.1.60403,4.1.60402,4.1.60300
      ,,,
      DEVELOPMENT TOOLS,,,
-      :doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.24455
      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3
      ,,,
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25381,20.0.0.25314,19.0.0.25133
-      :doc:`llvm-project <llvm-project:index>`,20.0.0.25381,20.0.0.25314,19.0.0.25133
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25381,20.0.0.25314,19.0.0.25133
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25224,18.0.0.24455
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25224,18.0.0.24491
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25224,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43482
-      :doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43482
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43484,6.3.42131
+      :doc:`HIP <hip:index>`,6.4.43484,6.4.43484,6.3.42131
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0
+

 .. rubric:: Footnotes

-.. [#rhel-10-702] RHEL 10.0 is not supported on AMD Radeon PRO V620 GPUs.
-.. [#rhel-700] RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
-.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
-.. [#ol-mi300x] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
-.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
-.. [#sles-db-700] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
-.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
-.. [#rl-700] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
-.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
-.. [#mi350x-os] AMD Instinct MI355X (gfx950) and MI350X(gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
-.. [#RDNA-OS-700] **For ROCm 7.0.x** - AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, and RHEL 9.6.
-.. [#rd-v710] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and Azure Linux 3.0.
-.. [#rd-v620] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) GPUs are supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
-.. [#mi325x-os] **For ROCm 7.0.x** - AMD Instinct MI325X GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
-.. [#mi300x-os] **For ROCm 7.0.x** - AMD Instinct MI300X GPUs (gfx942) are supported on all listed :ref:`supported_distributions`.
-.. [#mi300A-os] **For ROCm 7.0.x** - AMD Instinct MI300A GPUs (gfx942) are supported only on Ubuntu 24.04, Ubuntu 22.04, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
-.. [#mi200x-os] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04, Ubuntu 22.04, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
-.. [#mi100-os] **For ROCm 7.0.x** - AMD Instinct MI100 GPUs (gfx908) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
-.. [#tf-mi350] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
-.. [#dgl_compat] DGL is supported only on ROCm 6.4.0.
-.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
-.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
-.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
+.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+.. [#RDNA-OS] Radeon AI PRO R9700 (gfx1201), Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
+.. [#7700XT-OS] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
+.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.


@@ -194,34 +174,28 @@ Use this lookup table to confirm which operating system and kernel versions are
   :widths: 40, 20, 30, 20
   :stub-columns: 1

-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 GA, 6.11 HWE", 2.39
   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
-   ,,
-   `Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.0, 6.12.0-55, 2.39
-   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
+   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
   ,9.5, 5.14+, 2.34
-   ,9.4, 5.14.0-427, 2.34
+   ,9.4, 5.14+, 2.34
+   ,9.3, 5.14+, 2.34
   ,,
-   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
+   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0+, 2.28
+   ,8.9, 4.18.0, 2.28
   ,,
-   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.40-150700.51, 2.38
+   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.11.0+, 2.38
   ,15 SP6, "6.5.0+, 6.4.0", 2.38
   ,15 SP5, 5.14.21, 2.31
   ,,
-   `Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
-   ,,
-   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 10, 6.12.0 (UEK), 2.39
-   ,9, 6.12.0 (UEK), 2.34
+   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 5.15.0 (UEK), 2.35
   ,8, 5.15.0 (UEK), 2.28
   ,,
-   `Debian <https://www.debian.org/download>`_,13, 6.12, 2.35
-   ,12, 6.1.0, 2.36
+   `Debian <https://www.debian.org/download>`_,12, 6.1, 2.36
   ,,
-   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
+   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.60, 2.38
   ,,

 .. note::
@@ -254,46 +228,24 @@ Expand for full historical view of:

   .. rubric:: Footnotes

-   .. [#rhel-10-702-past-60] RHEL 10.0 is not supported on AMD Radeon PRO V620 GPUs.
-   .. [#rhel-700-past-60] **For ROCm 7.0.x** - RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
-   .. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI300X, MI350X, and MI355X. Oracle Linux 8 is supported only on AMD Instinct MI300X.
-   .. [#mi300x-past-60] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
-   .. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
-   .. [#sles-db-700-past-60] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
-   .. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
-   .. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
-   .. [#az-mi300x-630-past-60] **Prior ROCm 6.4.0**- Azure Linux 3.0 is supported only on AMD Instinct MI300X GPUs.
-   .. [#rl-700-past-60] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
-   .. [#mi350x-os-past-60] AMD Instinct MI355X (gfx950) and MI350X(gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
-   .. [#RDNA-OS-700-past-60] **For ROCm 7.0.x** AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, and RHEL 9.6.
-   .. [#RDNA-OS-past-60] **Prior ROCm 7.0.0** - Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
-   .. [#rd-v710-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and Azure Linux 3.0.
-   .. [#rd-v620-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) is supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
-   .. [#mi325x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI325X GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
-   .. [#mi300x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300X GPU (gfx942) is supported on all listed :ref:`supported_distributions`.
-   .. [#mi300A-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300A GPU (gfx942) is supported only on Ubuntu 24.04, Ubuntu 22.04, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
-   .. [#mi200x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04, Ubuntu 22.04, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
-   .. [#mi100-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI100 GPU (gfx908) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
+   .. [#mi300x-past-60] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
+   .. [#single-node-past-60] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
   .. [#7700XT-OS-past-60] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
   .. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
   .. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
   .. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
   .. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
-   .. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
-   .. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
-   .. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4.
-   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
-   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
-   .. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 series GPUs instead.
-   .. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
-   .. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
-   .. [#dgl_compat-past-60] DGL is supported only on ROCm 6.4.0.
-   .. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
-   .. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
-   .. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
-   .. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
-   .. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
-   .. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
-   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+   .. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
+   .. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
+   .. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4.
+   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
+   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
+   .. [#verl_compat] verl is supported only on ROCm 6.2.0.
+   .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is supported only on ROCm 6.3.0.
+   .. [#dgl_compat] DGL is supported only on ROCm 6.4.0.
+   .. [#megablocks_compat] Megablocks is supported only on ROCm 6.3.0.
+   .. [#taichi_compat] Taichi is supported only on ROCm 6.3.2.
+   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   
--- a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
@@ -1,107 +0,0 @@
-:orphan:
-
-.. meta::
-    :description: FlashInfer deep learning framework compatibility
-    :keywords: GPU, LLM, FlashInfer, compatibility
-
-.. version-set:: rocm_version latest
-
-********************************************************************************
-FlashInfer compatibility
-********************************************************************************
-
-`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator 
-for Large Language Models (LLMs) that provides high-performance implementation of graphics 
-processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well 
-as advanced performance across diverse scenarios.
-
-FlashInfer features highly efficient attention kernels, load-balanced scheduling, and memory-optimized 
-techniques, while supporting customized attention variants. It’s compatible with ``torch.compile``, and 
-offers high-performance LLM-specific operators, with easy integration through PyTorch, and C++ APIs.
-
-.. note::
-
-  The ROCm port of FlashInfer is under active development, and some features are not yet available. 
-  For the latest feature compatibility matrix, refer to the ``README`` of the 
-  `https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
-
-Support for the ROCm port of FlashInfer is available as follows:
-
- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer 
-  <https://github.com/ROCm/flashinfer>`__ repository. This location differs from the 
-  `https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_ 
-  upstream repository.
-
- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`, 
-  which includes ROCm, FlashInfer, and all required dependencies.
-
-  - See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>` 
-    to install and get started.
-
-  - See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__ 
-    in the upstream FlashInfer documentation.
-
-.. note::
-
-  Flashinfer is supported on ROCm 6.4.1.
-
-Supported devices
-================================================================================
-
-**Officially Supported**: AMD Instinct™ MI300X
-
-
-.. _flashinfer-recommendations:
-
-Use cases and recommendations
-================================================================================
-
-This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
-In the decode phase, tokens are generated sequentially, with the model predicting each new 
-token based on the previously generated tokens and the input context.
-
-FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense 
-attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
-
-Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm 
-also implements cascade attention from upstream to reduce memory usage. 
-
-For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for examples and best practices to optimize your workloads on AMD GPUs.
-
-.. _flashinfer-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
-with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the FlashInfer version from the official Docker Hub.
-The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
-Click |docker-icon| to view the image on Docker Hub.
-
-.. list-table:: 
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - ROCm
-      - FlashInfer
-      - PyTorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/flashinfer/flashinfer-0.2.5_rocm6.4_ubuntu24.04_py3.12_pytorch2.7/images/sha256-558914838821c88c557fb6d42cfbc1bdb67d79d19759f37c764a9ee801f93313"><i class="fab fa-docker fa-lg"></i> rocm/flashinfer</a>
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - `v0.2.5 <https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.5>`__
-      - `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
-      - 24.04
-      - `3.12 <https://www.python.org/downloads/release/python-3129/>`__
-
-
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -27,7 +27,7 @@ with ROCm support:
  - Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
    with ROCm and JAX preinstalled.

-  - ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
+  - ROCm JAX repository: `ROCm/jax <https://github.com/ROCm/jax>`_

  - See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
    to get started.
@@ -90,15 +90,75 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
 Docker image compatibility
 ================================================================================

-AMD provides preconfigured Docker images with JAX and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
-recommended way to get started with deep learning with JAX on ROCm.
-For ``jax-community`` images, see `rocm/jax-community
-<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`JAX on ROCm installation
-documentation <rocm-install-on-linux:jax-docker-support>` for a list of
-available ``rocm/jax`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest JAX version from the official Docker Hub and are validated for
+`ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table:: JAX Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - JAX
+      - Linux
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.12/images/sha256-8918fa806a172c1a10eb2f57131eb31b5d7c8fa1656b8729fe7d3d736112de83"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+
+      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
+      - Ubuntu 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.10/images/sha256-a394be13c67b7fc602216abee51233afd4b6cb7adaa57ca97e688fba82f9ad79"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+
+      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
+      - Ubuntu 22.04
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+
+AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_.
+
+.. list-table:: JAX community Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - JAX
+      - Linux
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.12.8/images/sha256-25dfaa0183e274bd0a3554a309af3249c6f16a1793226cb5373f418e39d3146a"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.12.8 <https://www.python.org/downloads/release/python-3128/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.11.11/images/sha256-ff9baeca9067d13e6c279c911e5a9e5beed0817d24fafd424367cc3d5bd381d7"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.11.11 <https://www.python.org/downloads/release/python-31111/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.10.16/images/sha256-8bab484be1713655f74da51a191ed824bb9d03db1104fd63530a1ac3c37cf7b1"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_

 .. _key_rocm_libraries:

@@ -250,54 +310,5 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
  Since version 0.1.56, JAX has full support for ROCm, and the
  :ref:`Known issues and important notes <jax_comp_known_issues>` section
  contains details about limitations specific to the ROCm backend. The list of
-  JAX API modules are maintained by the JAX project and is subject to change.
+  JAX API modules is maintained by the JAX project and is subject to change.
  Refer to the official Jax documentation for the most up-to-date information.
-
-Key features and enhancements for ROCm 7.0
-===============================================================================
-
- Upgraded XLA backend: Integrates a newer XLA version, enabling better
-  optimizations, broader operator support, and potential performance gains.
-
- RNN support: Native RNN support (including LSTMs via ``jax.experimental.rnn``)
-  now available on ROCm, aiding sequence model development.
-
- Comprehensive linear algebra capabilities: Offers robust ``jax.linalg``
-  operations, essential for scientific and machine learning tasks.
-
- Expanded AMD GPU architecture support: Provides ongoing support for gfx1101
-  GPUs and introduces support for gfx950 and gfx12xx GPUs.
-
- Mixed FP8 precision support: Enables ``lax.dot_general`` operations with mixed FP8
-  types, offering pathways for memory and compute efficiency.
-
- Streamlined PyPi packaging: Provides reliable PyPi wheels for JAX on ROCm,
-  simplifying the installation process.
-
- Pallas experimental kernel development: Continued Pallas framework
-  enhancements for custom GPU kernels, including new intrinsics (specific
-  kernel behaviors under review).
-
- Improved build system and CI: Enhanced ROCm build system and CI for greater
-  reliability and maintainability.
-
- Enhanced distributed computing setup: Improved JAX setup in multi-GPU
-  distributed environments.
-
-.. _jax_comp_known_issues:
-
-Known issues and notes for ROCm 7.0
-===============================================================================
-
- ``nn.dot_product_attention``: Certain configurations of ``jax.nn.dot_product_attention``
-  may cause segmentation faults, though the majority of use cases work correctly.
-
- SVD with dynamic shapes: SVD on inputs with dynamic/symbolic shapes might result in an error.
-  SVD with static shapes is unaffected.
-
- QR decomposition with symbolic shapes: QR decomposition operations may fail when using
-  symbolic/dynamic shapes in shape polymorphic contexts.
-
- Pallas kernels: Specific advanced Pallas kernels may exhibit variations in
-  numerical output or resource usage. These are actively reviewed as part of
-  Pallas's experimental development.
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -1,275 +0,0 @@
-:orphan:
-
-.. meta::
-    :description: llama.cpp deep learning framework compatibility
-    :keywords: GPU, GGML, llama.cpp compatibility
-
-.. version-set:: rocm_version latest
-
-********************************************************************************
-llama.cpp compatibility
-********************************************************************************
-
-`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework 
-for Large Language Model (LLM) inference that runs on both central processing units 
-(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing 
-a simple, dependency-free setup. 
-
-The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
-to accelerate inference and reduce memory usage. Originally built as a CPU-first library, 
-llama.cpp is easy to integrate with other programming environments and is widely 
-adopted across diverse platforms, including consumer devices. 
-
-ROCm support for llama.cpp is upstreamed, and you can build the official source code
-with ROCm support:
-
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
-  <https://github.com/ROCm/llama.cpp>`_ repository.
-
- Due to independent compatibility considerations, this location differs from the 
-  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
-
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
-  which includes ROCm, llama.cpp, and all required dependencies.
-
-  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
-    to install and get started.
-
-  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
-    in the upstream llama.cpp documentation.
-
-.. note::
-
-  llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
-
-Supported devices
-================================================================================
-
-**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
-
-
-Use cases and recommendations
-================================================================================
-
-llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
-
- Plain C/C++ implementation with no external dependencies
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
-
-llama.cpp is also used in a range of real-world applications, including:
-
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
-  A simple maze game where AI-controlled agents attempt to trick the player.
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
-  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
- Various other AI applications use llama.cpp as their inference engine;  
-  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
-
-For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
-
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__ 
-  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
-  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
-  AMD Instinct GPUs within the ROCm ecosystem. 
-
-.. _llama-cpp-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories represent the available llama.cpp versions from the official Docker Hub.
-Click |docker-icon| to view the image on Docker Hub.
-
-.. important::
-
-   Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
-
-   - Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` and convert into 4-bit quantization.
-   - Server: This image only includes the server executable file.
-   - Light: This image only includes the main executable file.
-
-.. list-table::
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Full Docker
-      - Server Docker
-      - Light Docker
-      - llama.cpp
-      - ROCm
-      - Ubuntu
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_full/images/sha256-a2ecd635eaa65bb289a9041330128677f3ae88bee6fee0597424b17e38d4903c"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_server/images/sha256-cb46b47df415addb5ceb6e6fdf0be70bf9d7f6863bbe6e10c2441ecb84246d52"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_light/images/sha256-8f8536eec4b05c0ff1c022f9fc6c527ad1c89e6c1ca0906e4d39e4de73edbde9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_full/images/sha256-f36de2a3b03ae53e81c85422cb3780368c9891e1ac7884b04403a921fe2ea45d"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_server/images/sha256-df15e8ab11a6837cd3736644fec1e047465d49e37d610ab0b79df000371327df"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_light/images/sha256-4ea2d5bb7964f0ee3ea9b30ba7f343edd6ddfab1b1037669ca7eafad2e3c2bd7"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - 22.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_full/images/sha256-5960fc850024a8a76451f9eaadd89b7e59981ae9f393b407310c1ddf18892577"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_server/images/sha256-1b79775d9f546065a6aaf9ca426e1dd4ed4de0b8f6ee83687758cc05af6538e6"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_light/images/sha256-8f863c4c2857ae42bebd64e4f1a0a1e7cc3ec4503f243e32b4a4dcad070ec361"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_full/images/sha256-888879b3ee208f9247076d7984524b8d1701ac72611689e89854a1588bec9867"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_server/images/sha256-90e4ff99a66743e33fd00728cd71a768588e5f5ef355aaa196669fe65ac70672"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_light/images/sha256-bd447a049939cb99054f8fbf3f2352870fe906a75e2dc3339c845c08b9c53f9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - 22.04
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_full/images/sha256-5b3a1bc4889c1fcade434b937fbf9cc1c22ff7dc0317c130339b0c9238bc88c4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_server/images/sha256-5228ff99d0f627a9032d668f4381b2e80dc1e301adc3e0821f26d8354b175271"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_light/images/sha256-b12723b332a826a89b7252dddf868cbe4d1a869562fc4aa4032f59e1a683b968"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_full/images/sha256-cd6e21a6a73f59b35dd5309b09dd77654a94d783bf13a55c14eb8dbf8e9c2615"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_server/images/sha256-c2b4689ab2c47e6626e8fea22d7a63eb03d47c0fde9f5ef8c9f158d15c423e58"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_light/images/sha256-1acc28f29ed87db9cbda629cb29e1989b8219884afe05f9105522be929e94da4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
-      - 22.04
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_full/images/sha256-2f8ae8a44510d96d52dea6cb398b224f7edeb7802df7ec488c6f63d206b3cdc9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_server/images/sha256-fece497ff9f4a28b12f645de52766941da8ead8471aa1ea84b61d4b4568e51f2"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_light/images/sha256-3e14352fa6f8c6128b23cf9342531c20dbfb522550b626e09d83b260a1947022"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_full/images/sha256-80763062ef0bec15038c35fd01267f1fc99a5dd171d4b48583cc668b15efad69"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_server/images/sha256-db2a6c957555ed83b819bbc54aea884a93192da0fb512dae63d32e0dc4e8ab8f"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_light/images/sha256-c6dbb07cc655fb079d5216e4b77451cb64a9daa0585d23b6fb8b32cb22021197"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - 22.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
-      - 24.04
-
-
-Key ROCm libraries for llama.cpp
-================================================================================
-
-llama.cpp functionality on ROCm is determined by its underlying library
-dependencies. These ROCm components affect the capabilities, performance, and
-feature set available to developers. Ensure you have the required libraries for 
-your corresponding ROCm version.
-
-.. list-table::
-    :header-rows: 1
-
-    * - ROCm library
-      - ROCm 7.0.0 version
-      - ROCm 6.4.x version
-      - Purpose
-      - Usage
-    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
-      - 3.0.0
-      - 2.4.0
-      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
-        matrix and vector operations.
-      - Supports operations such as matrix multiplication, matrix-vector
-        products, and tensor contractions. Utilized in both dense and batched
-        linear algebra operations.
-    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-      - 1.0.0
-      - 0.12.0
-      - hipBLASLt is an extension of the hipBLAS library, providing additional
-        features like epilogues fused into the matrix multiplication kernel or
-        use of integer tensor cores.
-      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipblasLt
-        kernels where possible.
-    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
-      - 2.0.0
-      - 1.7.0
-      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
-        multiplication (GEMM) and accumulation operations with mixed precision
-        support.
-      - Can be used to enhance the flash attention performance on AMD compute, by enabling
-        the flag during compile time.
-
-Previous versions
-===============================================================================
-See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
-of the ``ROCm/llama.cpp`` Docker image.
--- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
@@ -28,7 +28,7 @@ Supported devices
 ================================================================================

 - **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210

 Supported models and features
 ================================================================================
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -89,13 +89,141 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
 Docker image compatibility
 ================================================================================

-AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
-recommended way to get started with deep learning with PyTorch on ROCm.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`PyTorch on ROCm installation
-documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
-available ``rocm/pytorch`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: PyTorch Docker image components
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker
+      - PyTorch
+      - Ubuntu
+      - Python
+      - Apex
+      - torchvision
+      - TensorBoard
+      - MAGMA
+      - UCX
+      - OMPI
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-6a287591500b4048a9556c1ecc92bc411fd3d552f6c8233bc399f18eb803e8d6"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-06b967629ba6657709f04169832cd769a11e6b491e8b1394c361d42d7a0c8b43"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-62022414217ef6de33ac5b1341e57db8a48e8573fa2ace12d48aa5edd4b99ef0"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-469a7f74fc149aff31797e011ee41978f6a190adc69fa423b3c6a718a77bd985"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 22.04
+      - `3.11 <https://www.python.org/downloads/release/python-31113/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-37f41a1cd94019688669a1b20d33ea74156e0c129ef6b8270076ef214a6a1a2c"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-60824ba83dc1b9d94164925af1f81c0235c105dd555091ec04c57e05177ead1b"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-fe944fe083312f901be6891ab4d3ffebf2eaf2cf4f5f0f435ef0b76ec714fabd"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-1d59251c47170c5b8960d1172a4dbe52f5793d8966edd778f168eaf32d56661a"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
+      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
+      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__

 Key ROCm libraries for PyTorch
 ================================================================================
@@ -238,8 +366,7 @@ feature set available to developers.
 Supported modules and data types
 ================================================================================

-The following section outlines the supported data types, modules, and domain
-libraries available in PyTorch on ROCm.
+The following section outlines the supported data types, modules, and domain libraries available in PyTorch on ROCm.

 Supported data types
 --------------------------------------------------------------------------------
@@ -338,7 +465,7 @@ with ROCm.
    * - Library
      - Description

-    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
+    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
      - Audio and signal processing library for PyTorch. Provides utilities for
        audio I/O, signal and data processing functions, datasets, model
        implementations, and application components for audio and speech
@@ -365,11 +492,11 @@ with ROCm.
        and popular datasets for natural language processing, including
        tokenization, vocabulary management, and text embeddings.

-        **Note:** ``torchtext`` does not implement ROCm-specific kernels.
+        **Note:** ``torchtext`` does not implement ROCm-specific kernels.
        ROCm acceleration is provided through the underlying PyTorch framework
        and ROCm library integration. Only official release exists.

-    * - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
+    * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
      - Beta library of common modular data loading primitives for easily
        constructing flexible and performant data pipelines, with features still
        in prototype stage.
@@ -406,72 +533,3 @@ with ROCm.
        dispatching.

        **Note:** Only official release exists.
-
-Key features and enhancements for PyTorch 2.7 with ROCm 7.0
-================================================================================
-
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
-  TunableOp operations, improved offline tuning for ScaledGEMM operations,
-  submatrix offline tuning capabilities, and better logging for BLAS operations
-  without bias vectors.
-
- Expanded GPU architecture support: Provides optimized support for newer GPU
-  architectures, including gfx1200 and gfx1201 with preferred hipBLASLt backend
-  selection, along with improvements for gfx950 and gfx1100 series GPUs.
-
- Advanced Triton Integration: AOTriton 0.10b introduces official support for
-  gfx950 and gfx1201, along with experimental support for gfx1101, gfx1151,
-  gfx1150, and gfx1200.
-
- Improved element-wise kernel performance: Delivers enhanced vectorized
-  element-wise kernels with better support for heterogeneous tensor types and
-  optimized input vectorization for tensors with mixed data types.
-
- MIOpen deep learning optimizations: Enables NHWC BatchNorm by default on
-  ROCm 7.0+, provides ``maxpool`` forward and backward performance improvements
-  targeting ResNet scenarios, and includes updated launch configurations for
-  better performance.
-
- Enhanced memory and tensor operations: Features fixes for in-place ``aten``
-  sum operations with specialized templated kernels, improved 3D tensor
-  performance with NHWC format, and better handling of memory-bound matrix
-  multiplication operations.
-
- Robust testing and quality improvements: Includes comprehensive test suite
-  updates with improved tolerance handling for Navi3x architectures, generalized
-  ROCm-specific test conditions, and enhanced unit test coverage for Flash
-  Attention and Memory Efficient operations.
-
- Build system and infrastructure improvements: Provides updated CentOS Stream 9
-  support, improved Docker configuration, migration to public MAGMA repository,
-  and enhanced QA automation scripts for PyTorch unit testing.
-
- Composable Kernel (CK) updates: Features updated CK submodule integration with
-  the latest optimizations and performance improvements for core mathematical
-  operations.
-
- Development and debugging enhancements: Includes improved source handling for
-  dynamic compilation, better error handling for atomic operations, and enhanced
-  state checking for trace operations.
-
- Integrate APEX fused layer normalization, which can have positive impact on
-  text-to-video models.
-
- Integrate APEX distributed fused LAMB and distributed fused ADAM, which can
-  have positive impact on BERT-L and Llama2-SFT.
-
- FlashAttention v3 has been integrated for AMD GPUs.
-
- `Pytorch C++ extensions <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_
-  provide a mechanism for compiling custom operations that can be used during
-  network training or inference. For AMD platforms, ``amdclang++`` has been
-  validated as the supported compiler for building these extensions.
-
-Known issues and notes for PyTorch 2.7 with ROCm 7.0
-================================================================================
-
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
-  ``matmul.allow_bf16_reduced_precision_reduction`` options under
-  ``torch.backends.cuda`` are not supported. As a result,
-  reduced-precision reductions using FP16 or BF16 accumulation types are not
-  available.
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -1,111 +0,0 @@
-:orphan:
-
-.. meta::
-    :description: Ray deep learning framework compatibility
-    :keywords: GPU, Ray compatibility
-
-.. version-set:: rocm_version latest
-
-*******************************************************************************
-Ray compatibility
-*******************************************************************************
-
-Ray is a unified framework for scaling AI and Python applications from your laptop 
-to a full cluster, without changing your code. Ray consists of `a core distributed 
-runtime  <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of 
-`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for 
-simplifying machine learning computations.
-
-Ray is a general-purpose framework that runs many types of workloads efficiently. 
-Any Python application can be scaled with Ray, without extra infrastructure.
-
-ROCm support for Ray is upstreamed, and you can build the official source code
-with ROCm support: 
-
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
-  <https://github.com/ROCm/ray>`_ repository.
-
- Due to independent compatibility considerations, this location differs from the 
-  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
-
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>` 
-  which includes ROCm, Ray, and all required dependencies.
-
-  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
-    for instructions to get started.
-
-  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
-    in the upstream Ray documentation.
-
-  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
-    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
-
-.. note::
-
-  Ray is supported on ROCm 6.4.1.
-
-Supported devices
-================================================================================
-
-**Officially Supported**: AMD Instinct™ MI300X, MI210
-
-
-Use cases and recommendations
-================================================================================
-
-* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm 
-  Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__  
-  blog provides an overview of Volcano Engine Reinforcement Learning (verl) 
-  for large language models (LLMs) and discusses its benefits in large-scale 
-  reinforcement learning from human feedback (RLHF). It uses Ray as part of a 
-  hybrid orchestration engine to schedule and coordinate training and inference 
-  tasks in parallel, enabling optimized resource utilization and potential overlap 
-  between these phases. This dynamic resource allocation strategy significantly 
-  improves overall system efficiency. The blog presents verl’s performance results, 
-  focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X 
-  GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and 
-  accelerate your RLHF training with ROCm-optimized performance.
-
-* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows 
-  <https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
-  blog post describes key use cases such as training and inference for large language models (LLMs), 
-  model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale 
-  workloads using Ray in the ROCm environment.
-
-For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support 
-topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__ 
-of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
-
-.. _ray-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories represent the latest Ray version from the official Docker Hub and are validated for
-`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
-icon to view the image on Docker Hub.
-
-.. list-table::
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - Ray
-      - Pytorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
-      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
-      - 2.6.0+git684f6f2
-      - 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
--- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
@@ -27,7 +27,7 @@ Supported Devices
 ================================================================================

 - **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210


 Supported models and features
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -47,15 +47,80 @@ fixes, updates, and support for the latest ROCM versions.
 .. _tensorflow-docker-compat:

 Docker image compatibility
-================================================================================
+===============================================================================

-AMD provides preconfigured Docker images with TensorFlow and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/tensorflow>`__ and are the
-recommended way to get started with deep learning with TensorFlow on ROCm.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`TensorFlow on ROCm installation
-documentation <rocm-install-on-linux:tensorflow-docker-support>` for a list of
-available ``rocm/tensorflow`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `TensorFlow images
+<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
+Docker Hub. The following Docker image tags and associated inventories are
+validated for `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__. Click
+the |docker-icon| icon to view the image on Docker Hub.
+
+.. list-table:: TensorFlow Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - TensorFlow
+      - Ubuntu
+      - Python
+      - TensorBoard
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.18-dev/images/sha256-96754ce2d30f729e19b497279915b5212ba33d5e408e7e5dd3f2304d87e3441e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.18-dev/images/sha256-fa741508d383858e86985a9efac85174529127408102558ae2e3a4ac894eea1e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.17-dev/images/sha256-3a0aef09f2a8833c2b64b85874dd9449ffc2ad257351857338ff5b706c03a418"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.17-dev/images/sha256-bc7341a41ebe7ab261aa100732874507c452421ef733e408ac4f05ed453b0bc5"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.16-dev/images/sha256-4841a8df7c340dab79bf9362dad687797649a00d594e0832eb83ea6880a40d3b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.16-dev/images/sha256-883fa95aba960c58a3e46fceaa18f03ede2c7df89b8e9fd603ab2d47e0852897"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__


 Critical ROCm libraries for TensorFlow
--- a/docs/conceptual/gpu-arch.md
+++ b/docs/conceptual/gpu-arch.md
@@ -21,8 +21,7 @@ architecture.
 * [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
 * [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
 * [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
-* [MI300 performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
-* [MI350 series performance counters](./gpu-arch/mi350-performance-counters.rst)
+* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
 :::

 :::{grid-item-card}
--- a/docs/conceptual/gpu-arch/mi350-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi350-performance-counters.rst
@@ -1,530 +0,0 @@
-.. meta::
-  :description: MI355 series performance counters and metrics
-  :keywords: MI355, MI355X, MI3XX
-
-***********************************
-MI350 series performance counters
-***********************************
-
-This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 accelerators. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.
-
-The following sections list the performance counters based on the IP blocks.
-
-Command processor packet processor counters (CPC)
-==================================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - CPC_ALWAYS_COUNT
-      - Always count.
-
-    * - CPC_ADC_VALID_CHUNK_NOT_AVAIL
-      - ADC valid chunk is not available when dispatch walking is in progress in the multi-xcc mode.
-
-    * - CPC_ADC_DISPATCH_ALLOC_DONE
-      - ADC dispatch allocation is done.
-
-    * - CPC_ADC_VALID_CHUNK_END
-      - ADC crawler's valid chunk end in the multi-xcc mode.
-
-    * - CPC_SYNC_FIFO_FULL_LEVEL
-      - SYNC FIFO full last cycles.
-
-    * - CPC_SYNC_FIFO_FULL
-      - SYNC FIFO full times.
-
-    * - CPC_GD_BUSY
-      - ADC busy.
-
-    * - CPC_TG_SEND
-      - ADC thread group send.
-
-    * - CPC_WALK_NEXT_CHUNK
-      - ADC walking next valid chunk in the multi-xcc mode.
-
-    * - CPC_STALLED_BY_SE0_SPI
-      - ADC CSDATA stalled by SE0SPI.
-
-    * - CPC_STALLED_BY_SE1_SPI
-      - ADC CSDATA stalled by SE1SPI.
-
-    * - CPC_STALLED_BY_SE2_SPI
-      - ADC CSDATA stalled by SE2SPI.
-
-    * - CPC_STALLED_BY_SE3_SPI
-      - ADC CSDATA stalled by SE3SPI.
-
-    * - CPC_LTE_ALL
-      - CPC sync counter LteAll. Only Master XCD manages LteAll.
-
-    * - CPC_SYNC_WRREQ_FIFO_BUSY
-      - CPC sync counter request FIFO is not empty.
-
-    * - CPC_CANE_BUSY
-      - CPC CANE bus is busy, which indicates the presence of inflight sync counter requests.
-
-    * - CPC_CANE_STALL
-      - CPC sync counter sending is stalled by CANE.
-
-Shader pipe interpolators (SPI) counters
-=========================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - SPI_CS0_WINDOW_VALID
-      - Clock count enabled by PIPE0 perfcounter_start event.
-
-    * - SPI_CS0_BUSY
-      - Number of clocks with outstanding waves for PIPE0 (SPI or SH).
-
-    * - SPI_CS0_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE0.
-
-    * - SPI_CS0_CRAWLER_STALL
-      - Number of clocks when PIPE0 event or wave order FIFO is full.
-
-    * - SPI_CS0_EVENT_WAVE
-      - Number of PIPE0 events and waves.
-
-    * - SPI_CS0_WAVE
-      - Number of PIPE0 waves.
-
-    * - SPI_CS1_WINDOW_VALID
-      - Clock count enabled by PIPE1 perfcounter_start event.
-
-    * - SPI_CS1_BUSY
-      - Number of clocks with outstanding waves for PIPE1 (SPI or SH).
-
-    * - SPI_CS1_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE1.
-
-    * - SPI_CS1_CRAWLER_STALL
-      - Number of clocks when PIPE1 event or wave order FIFO is full.
-
-    * - SPI_CS1_EVENT_WAVE
-      - Number of PIPE1 events and waves.
-
-    * - SPI_CS1_WAVE
-      - Number of PIPE1 waves.
-
-    * - SPI_CS2_WINDOW_VALID
-      - Clock count enabled by PIPE2 perfcounter_start event.
-
-    * - SPI_CS2_BUSY
-      - Number of clocks with outstanding waves for PIPE2 (SPI or SH).
-
-    * - SPI_CS2_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE2.
-
-    * - SPI_CS2_CRAWLER_STALL
-      - Number of clocks when PIPE2 event or wave order FIFO is full.
-
-    * - SPI_CS2_EVENT_WAVE
-      - Number of PIPE2 events and waves.
-
-    * - SPI_CS2_WAVE
-      - Number of PIPE2 waves.
-
-    * - SPI_CS3_WINDOW_VALID
-      - Clock count enabled by PIPE3 perfcounter_start event.
-
-    * - SPI_CS3_BUSY
-      - Number of clocks with outstanding waves for PIPE3 (SPI or SH).
-
-    * - SPI_CS3_NUM_THREADGROUPS
-      - Number of thread groups launched for PIPE3.
-
-    * - SPI_CS3_CRAWLER_STALL
-      - Number of clocks when PIPE3 event or wave order FIFO is full.
-
-    * - SPI_CS3_EVENT_WAVE
-      - Number of PIPE3 events and waves.
-
-    * - SPI_CS3_WAVE
-      - Number of PIPE3 waves.
-
-    * - SPI_CSQ_P0_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue0.
-
-    * - SPI_CSQ_P0_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue1.
-
-    * - SPI_CSQ_P0_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue2.
-
-    * - SPI_CSQ_P0_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue3.
-
-    * - SPI_CSQ_P0_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue4.
-
-    * - SPI_CSQ_P0_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue5.
-
-    * - SPI_CSQ_P0_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue6.
-
-    * - SPI_CSQ_P0_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE0 Queue7.
-
-    * - SPI_CSQ_P1_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue0.
-
-    * - SPI_CSQ_P1_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue1.
-
-    * - SPI_CSQ_P1_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue2.
-
-    * - SPI_CSQ_P1_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue3.
-
-    * - SPI_CSQ_P1_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue4.
-
-    * - SPI_CSQ_P1_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue5.
-
-    * - SPI_CSQ_P1_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue6.
-
-    * - SPI_CSQ_P1_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE1 Queue7.
-
-    * - SPI_CSQ_P2_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue0.
-
-    * - SPI_CSQ_P2_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue1.
-
-    * - SPI_CSQ_P2_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue2.
-
-    * - SPI_CSQ_P2_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue3.
-
-    * - SPI_CSQ_P2_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue4.
-
-    * - SPI_CSQ_P2_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue5.
-
-    * - SPI_CSQ_P2_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue6.
-
-    * - SPI_CSQ_P2_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE2 Queue7.
-
-    * - SPI_CSQ_P3_Q0_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue0.
-
-    * - SPI_CSQ_P3_Q1_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue1.
-
-    * - SPI_CSQ_P3_Q2_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue2.
-
-    * - SPI_CSQ_P3_Q3_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue3.
-
-    * - SPI_CSQ_P3_Q4_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue4.
-
-    * - SPI_CSQ_P3_Q5_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue5.
-
-    * - SPI_CSQ_P3_Q6_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue6.
-
-    * - SPI_CSQ_P3_Q7_OCCUPANCY
-      - Sum of occupancy info for PIPE3 Queue7.
-
-    * - SPI_CSQ_P0_OCCUPANCY
-      - Sum of occupancy info for all PIPE0 queues.
-
-    * - SPI_CSQ_P1_OCCUPANCY
-      - Sum of occupancy info for all PIPE1 queues.
-
-    * - SPI_CSQ_P2_OCCUPANCY
-      - Sum of occupancy info for all PIPE2 queues.
-
-    * - SPI_CSQ_P3_OCCUPANCY
-      - Sum of occupancy info for all PIPE3 queues.
-
-    * - SPI_VWC0_VDATA_VALID_WR
-      - Number of clocks VGPR bus_0 writes VGPRs.
-
-    * - SPI_VWC1_VDATA_VALID_WR
-      - Number of clocks VGPR bus_1 writes VGPRs.
-
-    * - SPI_CSC_WAVE_CNT_BUSY
-      - Number of cycles when there is any wave in the pipe.
-
-Compute unit (SQ) counters
-===========================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - SQ_INSTS_VALU_MFMA_F6F4
-      - Number of VALU V_MFMA_*_F6F4 instructions.
-
-    * - SQ_INSTS_VALU_MFMA_MOPS_F6F4
-      - Number of VALU matrix with the performed math operations (add or mul) divided by 512, assuming a full EXEC mask of F6 or F4 data type.
-
-    * - SQ_ACTIVE_INST_VALU2
-      - Number of quad-cycles when two VALU instructions are issued (per-simd, nondeterministic).
-
-    * - SQ_INSTS_LDS_LOAD
-      - Number of LDS load instructions issued (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_STORE
-      - Number of LDS store instructions issued (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_ATOMIC
-      - Number of LDS atomic instructions issued (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_LOAD_BANDWIDTH
-      - Total number of 64-bytes loaded (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_STORE_BANDWIDTH
-      - Total number of 64-bytes written (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
-
-    * - SQ_INSTS_LDS_ATOMIC_BANDWIDTH
-      - Total number of 64-bytes atomic (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
-
-    * - SQ_INSTS_VALU_FLOPS_FP16
-      - Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP32
-      - Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP64
-      - Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP16_TRANS
-      - Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP32_TRANS
-      - Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_FLOPS_FP64_TRANS
-      - Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
-
-    * - SQ_INSTS_VALU_IOPS
-      - Counts OPS per instruction on integer or unsigned or bit data (per-simd, emulated).
-
-    * - SQ_LDS_DATA_FIFO_FULL
-      - Number of cycles LDS data FIFO is full (nondeterministic, unwindowed).
-
-    * - SQ_LDS_CMD_FIFO_FULL
-      - Number of cycles LDS command FIFO is full (nondeterministic, unwindowed).
-
-    * - SQ_VMEM_TA_ADDR_FIFO_FULL
-      - Number of cycles texture requests are stalled due to full address FIFO in TA (nondeterministic, unwindowed).
-
-    * - SQ_VMEM_TA_CMD_FIFO_FULL
-      - Number of cycles texture requests are stalled due to full cmd FIFO in TA (nondeterministic, unwindowed).
-
-    * - SQ_VMEM_WR_TA_DATA_FIFO_FULL
-      - Number of cycles texture writes are stalled due to full data FIFO in TA (nondeterministic, unwindowed).
-
-    * - SQC_ICACHE_MISSES_DUPLICATE
-      - Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
-
-    * - SQC_DCACHE_MISSES_DUPLICATE
-      - Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
-
-Texture addressing (TA) unit counters
-======================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TA_BUFFER_READ_LDS_WAVEFRONTS
-      - Number of buffer read wavefronts for LDS return processed by the TA.
-
-    * - TA_FLAT_READ_LDS_WAVEFRONTS
-      - Number of flat opcode reads for LDS return processed by the TA.
-
-Texture data (TD) unit counters
-================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TD_WRITE_ACKT_WAVEFRONT
-      - Number of write acknowledgments, sent to SQ and not to SP.
-
-    * - TD_TD_SP_TRAFFIC
-      - Number of times this TD sends data to the SP.
-
-Texture cache per pipe (TCP) counters
-======================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TCP_TCP_TA_ADDR_STALL_CYCLES
-      - TCP stalls TA addr interface.
-
-    * - TCP_TCP_TA_DATA_STALL_CYCLES
-      - TCP stalls TA data interface. Now windowed.
-
-    * - TCP_LFIFO_STALL_CYCLES
-      - Memory latency FIFOs full stall.
-
-    * - TCP_RFIFO_STALL_CYCLES
-      - Memory Request FIFOs full stall.
-
-    * - TCP_TCR_RDRET_STALL
-      - Write into cache stalled by read return from TCR.
-
-    * - TCP_PENDING_STALL_CYCLES
-      - Stall due to data pending from L2.
-
-    * - TCP_UTCL1_SERIALIZATION_STALL
-      - Total number of stalls caused due to serializing translation requests through the UTCL1.
-
-    * - TCP_UTCL1_THRASHING_STALL
-      - Stall caused by thrashing feature in any probe. Lacks accuracy when the stall signal overlaps between probe0 and probe1, which is worse with MECO of thrashing deadlock. Some probe0 events could miss being counted in with MECO on. This perf count provides a rough thrashing estimate.
-
-    * - TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS
-      - Translation miss_under_miss.
-
-    * - TCP_UTCL1_STALL_INFLIGHT_MAX
-      - Total UTCL1 stalls due to inflight counter saturation.
-
-    * - TCP_UTCL1_STALL_LRU_INFLIGHT
-      - Total UTCL1 stalls due to LRU cache line with inflight traffic.
-
-    * - TCP_UTCL1_STALL_MULTI_MISS
-      - Total UTCL1 stalls due to arbitrated multiple misses.
-
-    * - TCP_UTCL1_LFIFO_FULL
-      - Total UTCL1 and UTCL2 latency, which hides FIFO full cycles.
-
-    * - TCP_UTCL1_STALL_LFIFO_NOT_RES
-      - Total UTCL1 stalls due to UTCL2 latency, which hides FIFO output (not resident).
-
-    * - TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS
-      - Total UTCL1 stalls due to UTCL2_req being out of credits.
-
-    * - TCP_CLIENT_UTCL1_INFLIGHT
-      - The sum of inflight client to UTCL1 requests per cycle.
-
-    * - TCP_TAGRAM0_REQ
-      - Total L2 requests mapping to TagRAM 0 from this TCP to all TCCs.
-
-    * - TCP_TAGRAM1_REQ
-      - Total L2 requests mapping to TagRAM 1 from this TCP to all TCCs.
-
-    * - TCP_TAGRAM2_REQ
-      - Total L2 requests mapping to TagRAM 2 from this TCP to all TCCs.
-
-    * - TCP_TAGRAM3_REQ
-      - Total L2 requests mapping to TagRAM 3 from this TCP to all TCCs.
-
-    * - TCP_TCP_LATENCY
-      - Total TCP wave latency (from the first clock of wave entering to the first clock of wave leaving). Divide by TA_TCP_STATE_READ to find average wave latency.
-
-    * - TCP_TCC_READ_REQ_LATENCY
-      - Total TCP to TCC request latency for reads and atomics with return. Not Windowed.
-
-    * - TCP_TCC_WRITE_REQ_LATENCY
-      - Total TCP to TCC request latency for writes and atomics without return. Not Windowed.
-
-    * - TCP_TCC_WRITE_REQ_HOLE_LATENCY
-      - Total TCP req to TCC hole latency for writes and atomics. Not Windowed.
-
-Texture cache per channel (TCC) counters
-=========================================
-
-.. list-table::
-    :header-rows: 1
-
-    * - Hardware counter
-      - Definition
-
-    * - TCC_READ_SECTORS
-      - Total number of 32B data sectors in read requests.
-
-    * - TCC_WRITE_SECTORS
-      - Total number of 32B data sectors in write requests.
-
-    * - TCC_ATOMIC_SECTORS
-      - Total number of 32B data sectors in atomic requests.
-
-    * - TCC_BYPASS_REQ
-      - Number of bypass requests. This is measured at the tag block.
-
-    * - TCC_LATENCY_FIFO_FULL
-      - Number of cycles when the latency FIFO is full.
-
-    * - TCC_SRC_FIFO_FULL
-      - Number of cycles when the SRC FIFO is assumed to be full as measured at the IB block.
-
-    * - TCC_EA0_RDREQ_64B
-      - Number of 64-byte TCC/EA read requests.
-
-    * - TCC_EA0_RDREQ_128B
-      - Number of 128-byte TCC/EA read requests.
-
-    * - TCC_IB_REQ
-      - Number of requests through the IB. This measures the number of raw requests from graphics clients to this TCC.
-
-    * - TCC_IB_STALL
-      - Number of cycles when the IB output is stalled.
-
-    * - TCC_EA0_WRREQ_WRITE_DRAM
-      - Number of TCC/EA write requests (32-byte or 64-byte) destined for DRAM (MC).
-
-    * - TCC_EA0_WRREQ_ATOMIC_DRAM
-      - Number of TCC/EA atomic requests (32-byte or 64-byte) destined for DRAM (MC).
-
-    * - TCC_EA0_RDREQ_DRAM_32B
-      - Number of 32-byte TCC/EA read requests due to DRAM traffic. One 64-byte request is counted as two and one 128-byte as four.
-
-    * - TCC_EA0_RDREQ_GMI_32B
-      - Number of 32-byte TCC/EA read requests due to GMI traffic. One 64-byte request is counted as two and one 128-byte as four.
-
-    * - TCC_EA0_RDREQ_IO_32B
-      - Number of 32-byte TCC/EA read requests due to IO traffic. One 64-byte request is counted as two and one 128-byte as four.
-
-    * - TCC_EA0_WRREQ_WRITE_DRAM_32B
-      - Number of 32-byte TCC/EA write requests due to DRAM traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_ATOMIC_DRAM_32B
-      - Number of 32-byte TCC/EA atomic requests due to DRAM traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_WRITE_GMI_32B
-      - Number of 32-byte TCC/EA write requests due to GMI traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_ATOMIC_GMI_32B
-      - Number of 32-byte TCC/EA atomic requests due to GMI traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_WRITE_IO_32B
-      - Number of 32-byte TCC/EA write requests due to IO traffic. One 64-byte request is counted as two.
-
-    * - TCC_EA0_WRREQ_ATOMIC_IO_32B
-      - Number of 32-byte TCC/EA atomic requests due to IO traffic. One 64-byte request is counted as two.
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -89,15 +89,15 @@ project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "7.0.2"
-release = "7.0.2"
+version = "6.4.3"
+release = "6.4.3"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""

 # pages with specific settings
 article_pages = [
-    {"file": "about/release-notes", "os": ["linux"], "date": "2025-10-10"},
+    {"file": "about/release-notes", "os": ["linux"], "date": "2025-08-07"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
@@ -108,17 +108,11 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
-    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
-    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
-    {"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/index", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/multi-node-setup", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/prerequisite-system-validation", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/system-health-check", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
@@ -130,24 +124,14 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
@@ -172,8 +156,6 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

--- a/docs/data/about/compatibility/floating-point-data-types.png
+++ b/docs/data/about/compatibility/floating-point-data-types.png
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
@@ -1,91 +0,0 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
-      rocm_version: 6.4.1
-      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
-      hipblaslt_version: 0.15
-  model_groups:
-    - group: Meta Llama
-      tag: llama
-      models:
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-      - model: Llama 3.1 70B
-        mad_tag: pyt_vllm_llama-3.1-70b
-        model_repo: meta-llama/Llama-3.1-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: float16
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 70B FP8
-        mad_tag: pyt_vllm_llama-3.1-70b_fp8
-        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-    - group: Mistral AI
-      tag: mistral
-      models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-    - group: Qwen
-      tag: qwen
-      models:
-      - model: QwQ-32B
-        mad_tag: pyt_vllm_qwq-32b
-        model_repo: Qwen/QwQ-32B
-        url: https://huggingface.co/Qwen/QwQ-32B
-        precision: float16
-      - model: Qwen3 30B A3B
-        mad_tag: pyt_vllm_qwen3-30b-a3b
-        model_repo: Qwen/Qwen3-30B-A3B
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-        precision: float16
-    - group: Microsoft Phi
-      tag: phi
-      models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
@@ -1,188 +0,0 @@
-dockers:
-  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
-    components:
-      ROCm: 6.4.1
-      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
-      PyTorch: 2.7.0+gitf717b2a
-      hipBLASLt: 0.15
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_vllm_llama-3.1-8b
-      model_repo: meta-llama/Llama-3.1-8B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 70B
-      mad_tag: pyt_vllm_llama-3.1-70b
-      model_repo: meta-llama/Llama-3.1-70B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 405B
-      mad_tag: pyt_vllm_llama-3.1-405b
-      model_repo: meta-llama/Llama-3.1-405B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 2 70B
-      mad_tag: pyt_vllm_llama-2-70b
-      model_repo: meta-llama/Llama-2-70b-chat-hf
-      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 4096
-        max_num_batched_tokens: 4096
-        max_model_len: 4096
-    - model: Llama 3.1 8B FP8
-      mad_tag: pyt_vllm_llama-3.1-8b_fp8
-      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 70B FP8
-      mad_tag: pyt_vllm_llama-3.1-70b_fp8
-      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 405B FP8
-      mad_tag: pyt_vllm_llama-3.1-405b_fp8
-      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-  - group: Mistral AI
-    tag: mistral
-    models:
-    - model: Mixtral MoE 8x7B
-      mad_tag: pyt_vllm_mixtral-8x7b
-      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-    - model: Mixtral MoE 8x22B
-      mad_tag: pyt_vllm_mixtral-8x22b
-      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 65536
-        max_num_batched_tokens: 65536
-        max_model_len: 8192
-    - model: Mixtral MoE 8x7B FP8
-      mad_tag: pyt_vllm_mixtral-8x7b_fp8
-      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-    - model: Mixtral MoE 8x22B FP8
-      mad_tag: pyt_vllm_mixtral-8x22b_fp8
-      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 65536
-        max_num_batched_tokens: 65536
-        max_model_len: 8192
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: QwQ-32B
-      mad_tag: pyt_vllm_qwq-32b
-      model_repo: Qwen/QwQ-32B
-      url: https://huggingface.co/Qwen/QwQ-32B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Qwen3 30B A3B
-      mad_tag: pyt_vllm_qwen3-30b-a3b
-      model_repo: Qwen/Qwen3-30B-A3B
-      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-  - group: Microsoft Phi
-    tag: phi
-    models:
-    - model: Phi-4
-      mad_tag: pyt_vllm_phi-4
-      model_repo: microsoft/phi-4
-      url: https://huggingface.co/microsoft/phi-4
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 16384
-        max_num_batched_tokens: 16384
-        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
@@ -1,16 +1,17 @@
-dockers:
-  - pull_tag: lmsysorg/sglang:v0.4.5-rocm630
-    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
-    components:
-      ROCm: 6.3.0
-      SGLang: 0.4.5 (0.4.5-rocm)
-      PyTorch: 2.6.0a0+git8d4926e
-model_groups:
-  - group: DeepSeek
-    tag: deepseek
-    models:
-    - model: DeepSeek-R1-Distill-Qwen-32B
-      mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
-      model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-      url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-      precision: bfloat16
+sglang_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+      docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
+      rocm_version: 6.3.0
+      sglang_version: 0.4.5 (0.4.5-rocm)
+      pytorch_version: 2.6.0a0+git8d4926e
+  model_groups:
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek-R1-Distill-Qwen-32B
+        mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
+        model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+        precision: bfloat16
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
@@ -1,32 +0,0 @@
-dockers:
-  - pull_tag: lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
-    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729
-    components:
-      ROCm: 7.0.0
-      SGLang: v0.5.2rc1
-      pytorch-triton-rocm: 3.4.0+rocm7.0.0.gitf9e5bf54
-model_groups:
-  - group: Dense models
-    tag: dense-models
-    models:
-      - model: Llama 3.1 8B Instruct
-        model_repo: Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
-      - model: Llama 3.1 405B FP8 KV
-        model_repo: Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-      - model: Llama 3.3 70B FP8 KV
-        model_repo: amd-Llama-3.3-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
-      - model: Qwen3 32B
-        model_repo: Qwen3-32B
-        url: https://huggingface.co/Qwen/Qwen3-32B
-  - group: Small experts models
-    tag: small-experts-models
-    models:
-      - model: DeepSeek V3
-        model_repo: DeepSeek-V3
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V3
-      - model: Mixtral 8x7B v0.1
-        model_repo: Mixtral-8x7B-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,316 +1,88 @@
-dockers:
-  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
-    components:
-      ROCm: 7.0.0
-      vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
-      PyTorch: 2.9.0a0+git1c57644
-      hipBLASLt: 1.0.0
-    dockerfile:
-      commit: 790d22168820507f3105fef29596549378cfe399
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 4096
-          max_model_len: 4096
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # TODO: update me
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
+      rocm_version: 6.4.1
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
      - model: Llama 3.1 8B
        mad_tag: pyt_vllm_llama-3.1-8b
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
      - model: Llama 3.1 405B
        mad_tag: pyt_vllm_llama-3.1-405b
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
      - model: Llama 3.1 405B FP8
        mad_tag: pyt_vllm_llama-3.1-405b_fp8
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B MXFP4
-        mad_tag: pyt_vllm_llama-3.1-405b_fp4
-        model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B
-        mad_tag: pyt_vllm_llama-3.3-70b
-        model_repo: meta-llama/Llama-3.3-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B FP8
-        mad_tag: pyt_vllm_llama-3.3-70b_fp8
-        model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B MXFP4
-        mad_tag: pyt_vllm_llama-3.3-70b_fp4
-        model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 4 Scout 17Bx16E
-        mad_tag: pyt_vllm_llama-4-scout-17b-16e
-        model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E FP8
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek R1 0528 FP8
-        mad_tag: pyt_vllm_deepseek-r1
-        model_repo: deepseek-ai/DeepSeek-R1-0528
-        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_seqs: 1024
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: OpenAI GPT OSS
-    tag: gpt-oss
-    models:
-      - model: GPT OSS 20B
-        mad_tag: pyt_vllm_gpt-oss-20b
-        model_repo: openai/gpt-oss-20b
-        url: https://huggingface.co/openai/gpt-oss-20b
-        precision: bfloat16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
-      - model: GPT OSS 120B
-        mad_tag: pyt_vllm_gpt-oss-120b
-        model_repo: openai/gpt-oss-120b
-        url: https://huggingface.co/openai/gpt-oss-120b
-        precision: bfloat16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
-  - group: Mistral AI
-    tag: mistral
-    models:
+    - group: Mistral AI
+      tag: mistral
+      models:
      - model: Mixtral MoE 8x7B
        mad_tag: pyt_vllm_mixtral-8x7b
        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
      - model: Mixtral MoE 8x22B
        mad_tag: pyt_vllm_mixtral-8x22b
        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
      - model: Mixtral MoE 8x22B FP8
        mad_tag: pyt_vllm_mixtral-8x22b_fp8
        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen3 8B
-        mad_tag: pyt_vllm_qwen3-8b
-        model_repo: Qwen/Qwen3-8B
-        url: https://huggingface.co/Qwen/Qwen3-8B
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 32B
-        mad_tag: pyt_vllm_qwen3-32b
-        model_repo: Qwen/Qwen3-32b
-        url: https://huggingface.co/Qwen/Qwen3-32B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B
-        mad_tag: pyt_vllm_qwen3-30b-a3b
-        model_repo: Qwen/Qwen3-30B-A3B
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B FP8
-        mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
-        model_repo: Qwen/Qwen3-30B-A3B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B
-        mad_tag: pyt_vllm_qwen3-235b-a22b
-        model_repo: Qwen/Qwen3-235B-A22B
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B FP8
-        mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
-        model_repo: Qwen/Qwen3-235B-A22B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-  - group: Microsoft Phi
-    tag: phi
-    models:
+        tunableop: true
+    - group: Microsoft Phi
+      tag: phi
+      models:
      - model: Phi-4
        mad_tag: pyt_vllm_phi-4
        model_repo: microsoft/phi-4
        url: https://huggingface.co/microsoft/phi-4
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 16384
-          max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -1,72 +0,0 @@
-dockers:
-  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
-    components:
-      ROCm: 6.4.1
-      JAX: 0.6.0
-      Python: 3.10.12
-      Transformer Engine: 2.1.0+90d703dd
-      hipBLASLt: 1.1.0-499ece1c21
-  - pull_tag: rocm/jax-training:maxtext-v25.7
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
-    components:
-      ROCm: 6.4.1
-      JAX: 0.5.0
-      Python: 3.10.12
-      Transformer Engine: 2.1.0+90d703dd
-      hipBLASLt: 1.x.x
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: jax_maxtext_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 8B
-        mad_tag: jax_maxtext_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 70B
-        mad_tag: jax_maxtext_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3 8B
-        mad_tag: jax_maxtext_train_llama-3-8b
-        multinode_training_script: llama3_8b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3 70B
-        mad_tag: jax_maxtext_train_llama-3-70b
-        multinode_training_script: llama3_70b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 2 7B
-        mad_tag: jax_maxtext_train_llama-2-7b
-        model_repo: Llama-2-7B
-        precision: bf16
-        multinode_training_script: llama2_7b_multinode.sh
-        doc_options: ["single-node", "multi-node"]
-      - model: Llama 2 70B
-        mad_tag: jax_maxtext_train_llama-2-70b
-        model_repo: Llama-2-70B
-        precision: bf16
-        multinode_training_script: llama2_70b_multinode.sh
-        doc_options: ["single-node", "multi-node"]
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V2-Lite (16B)
-        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
-        model_repo: DeepSeek-V2-lite
-        precision: bf16
-        doc_options: ["single-node"]
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: jax_maxtext_train_mixtral-8x7b
-        model_repo: Mixtral-8x7B
-        precision: bf16
-        doc_options: ["single-node"]
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,12 +1,13 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.8_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
-      ROCm: 6.4.3
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      hipBLASLt: d1b517fc7a
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
      Triton: 3.3.0
      RCCL: 2.22.3
 model_groups:
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.7-benchmark-models.yaml
@@ -1,49 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.7_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
-    components:
-      ROCm: 6.4.2
-      Primus: v0.1.0-rc1
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+ba586519
-      hipBLASLt: 37ba1d36
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 3.1 70B (proxy)
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
@@ -1,58 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.7_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
-    components:
-      ROCm: 6.4.2
-      Primus: v0.1.0-rc1
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+ba586519
-      hipBLASLt: 37ba1d36
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
@@ -1,120 +0,0 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
-model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain]
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
-    models:
-    - model: Llama 4 Scout 17B-16E
-      mad_tag: pyt_train_llama-4-scout-17b-16e
-      model_repo: Llama-4-17B_16E
-      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.3 70B
-      mad_tag: pyt_train_llama-3.3-70b
-      model_repo: Llama-3.3-70B
-      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 3.2 1B
-      mad_tag: pyt_train_llama-3.2-1b
-      model_repo: Llama-3.2-1B
-      url: https://huggingface.co/meta-llama/Llama-3.2-1B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 3B
-      mad_tag: pyt_train_llama-3.2-3b
-      model_repo: Llama-3.2-3B
-      url: https://huggingface.co/meta-llama/Llama-3.2-3B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 Vision 11B
-      mad_tag: pyt_train_llama-3.2-vision-11b
-      model_repo: Llama-3.2-Vision-11B
-      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.2 Vision 90B
-      mad_tag: pyt_train_llama-3.2-vision-90b
-      model_repo: Llama-3.2-Vision-90B
-      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 3.1 405B
-      mad_tag: pyt_train_llama-3.1-405b
-      model_repo: Llama-3.1-405B
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B
-      precision: BF16
-      training_modes: [finetune_qlora, HF_finetune_lora]
-    - model: Llama 3 8B
-      mad_tag: pyt_train_llama-3-8b
-      model_repo: Llama-3-8B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3 70B
-      mad_tag: pyt_train_llama-3-70b
-      model_repo: Llama-3-70B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 7B
-      mad_tag: pyt_train_llama-2-7b
-      model_repo: Llama-2-7B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 2 13B
-      mad_tag: pyt_train_llama-2-13b
-      model_repo: Llama-2-13B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 70B
-      mad_tag: pyt_train_llama-2-70b
-      model_repo: Llama-2-70B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
@@ -1,162 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.7
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
-    components:
-      ROCm: 6.4.2
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+94e53dd8
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-4b9a52edfc
-      Triton: 3.3.0
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 4 Scout 17B-16E
-      mad_tag: pyt_train_llama-4-scout-17b-16e
-      model_repo: Llama-4-17B_16E
-      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.3 70B
-      mad_tag: pyt_train_llama-3.3-70b
-      model_repo: Llama-3.3-70B
-      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 3.2 1B
-      mad_tag: pyt_train_llama-3.2-1b
-      model_repo: Llama-3.2-1B
-      url: https://huggingface.co/meta-llama/Llama-3.2-1B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 3B
-      mad_tag: pyt_train_llama-3.2-3b
-      model_repo: Llama-3.2-3B
-      url: https://huggingface.co/meta-llama/Llama-3.2-3B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 Vision 11B
-      mad_tag: pyt_train_llama-3.2-vision-11b
-      model_repo: Llama-3.2-Vision-11B
-      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.2 Vision 90B
-      mad_tag: pyt_train_llama-3.2-vision-90b
-      model_repo: Llama-3.2-Vision-90B
-      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora]
-    - model: Llama 3.1 405B
-      mad_tag: pyt_train_llama-3.1-405b
-      model_repo: Llama-3.1-405B
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B
-      precision: BF16
-      training_modes: [finetune_qlora]
-    - model: Llama 3 8B
-      mad_tag: pyt_train_llama-3-8b
-      model_repo: Llama-3-8B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3 70B
-      mad_tag: pyt_train_llama-3-70b
-      model_repo: Llama-3-70B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 7B
-      mad_tag: pyt_train_llama-2-7b
-      model_repo: Llama-2-7B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 2 13B
-      mad_tag: pyt_train_llama-2-13b
-      model_repo: Llama-2-13B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 70B
-      mad_tag: pyt_train_llama-2-70b
-      model_repo: Llama-2-70B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-    - model: GPT OSS 20B
-      mad_tag: pyt_train_gpt_oss_20b
-      model_repo: GPT-OSS-20B
-      url: https://huggingface.co/openai/gpt-oss-20b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-    - model: GPT OSS 120B
-      mad_tag: pyt_train_gpt_oss_120b
-      model_repo: GPT-OSS-120B
-      url: https://huggingface.co/openai/gpt-oss-120b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: Qwen 3 8B
-      mad_tag: pyt_train_qwen3-8b
-      model_repo: Qwen3-8B
-      url: https://huggingface.co/Qwen/Qwen3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 3 32B
-      mad_tag: pyt_train_qwen3-32b
-      model_repo: Qwen3-32
-      url: https://huggingface.co/Qwen/Qwen3-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 32B
-      mad_tag: pyt_train_qwen2.5-32b
-      model_repo: Qwen2.5-32B
-      url: https://huggingface.co/Qwen/Qwen2.5-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 72B
-      mad_tag: pyt_train_qwen2.5-72b
-      model_repo: Qwen2.5-72B
-      url: https://huggingface.co/Qwen/Qwen2.5-72B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2 1.5B
-      mad_tag: pyt_train_qwen2-1.5b
-      model_repo: Qwen2-1.5B
-      url: https://huggingface.co/Qwen/Qwen2-1.5B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 2 7B
-      mad_tag: pyt_train_qwen2-7b
-      model_repo: Qwen2-7B
-      url: https://huggingface.co/Qwen/Qwen2-7B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-  - group: Flux
-    tag: flux
-    models:
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -1,13 +1,13 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.8_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
-      ROCm: 6.4.3
-      Primus: 927a717
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      hipBLASLt: d1b517fc7a
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
      Triton: 3.3.0
      RCCL: 2.22.3
 model_groups:
--- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -1,24 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.8
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+a1e66aae
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-d1b517fc7a
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: primus_pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-    - model: Llama 3.1 70B
-      mad_tag: primus_pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
-      precision: BF16
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,16 +1,38 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.8
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+a1e66aae
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-d1b517fc7a
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
 model_groups:
-  - group: Meta Llama
-    tag: llama
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
    models:
    - model: Llama 4 Scout 17B-16E
      mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -53,19 +75,19 @@ model_groups:
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
+      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora]
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
    - model: Llama 3.1 405B
      mad_tag: pyt_train_llama-3.1-405b
      model_repo: Llama-3.1-405B
      url: https://huggingface.co/meta-llama/Llama-3.1-405B
      precision: BF16
-      training_modes: [finetune_qlora]
+      training_modes: [finetune_qlora, HF_finetune_lora]
    - model: Llama 3 8B
      mad_tag: pyt_train_llama-3-8b
      model_repo: Llama-3-8B
@@ -95,84 +117,4 @@ model_groups:
      model_repo: Llama-2-70B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-    - model: GPT OSS 20B
-      mad_tag: pyt_train_gpt_oss_20b
-      model_repo: GPT-OSS-20B
-      url: https://huggingface.co/openai/gpt-oss-20b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-    - model: GPT OSS 120B
-      mad_tag: pyt_train_gpt_oss_120b
-      model_repo: GPT-OSS-120B
-      url: https://huggingface.co/openai/gpt-oss-120b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: Qwen 3 8B
-      mad_tag: pyt_train_qwen3-8b
-      model_repo: Qwen3-8B
-      url: https://huggingface.co/Qwen/Qwen3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 3 32B
-      mad_tag: pyt_train_qwen3-32b
-      model_repo: Qwen3-32
-      url: https://huggingface.co/Qwen/Qwen3-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 32B
-      mad_tag: pyt_train_qwen2.5-32b
-      model_repo: Qwen2.5-32B
-      url: https://huggingface.co/Qwen/Qwen2.5-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 72B
-      mad_tag: pyt_train_qwen2.5-72b
-      model_repo: Qwen2.5-72B
-      url: https://huggingface.co/Qwen/Qwen2.5-72B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2 1.5B
-      mad_tag: pyt_train_qwen2-1.5b
-      model_repo: Qwen2-1.5B
-      url: https://huggingface.co/Qwen/Qwen2-1.5B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 2 7B
-      mad_tag: pyt_train_qwen2-7b
-      model_repo: Qwen2-7B
-      url: https://huggingface.co/Qwen/Qwen2-7B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-    - model: Stable Diffusion XL
-      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-      model_repo: SDXL
-      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-      precision: BF16
-      training_modes: [finetune_lora]
-  - group: Flux
-    tag: flux
-    models:
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: NCF
-    tag: ncf
-    models:
-    - model: NCF
-      mad_tag: pyt_ncf_training
-      model_repo:
-      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
-      precision: FP32
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
--- a/docs/data/reference/precision-support/precision-support.yaml
+++ b/docs/data/reference/precision-support/precision-support.yaml
@@ -1,391 +0,0 @@
-# rocm-library-support.yaml
-library_groups:
-  - group: "ML & Computer Vision"
-    tag: "ml-cv"
-    libraries:
-      - name: "Composable Kernel"
-        tag: "composable-kernel"
-        doc_link: "composable_kernel:reference/Composable_Kernel_supported_scalar_types"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "float4"
-            support: "✅"
-          - type: "float6 (E2M3)"
-            support: "✅"
-          - type: "float6 (E3M2)"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "MIGraphX"
-        tag: "migraphx"
-        doc_link: "amdmigraphx:reference/cpp"
-        data_types:
-          - type: "int8"
-            support: "⚠️"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "MIOpen"
-        tag: "miopen"
-        doc_link: "miopen:reference/datatypes"
-        data_types:
-          - type: "int8"
-            support: "⚠️"
-          - type: "int32"
-            support: "⚠️"
-          - type: "float8 (E4M3)"
-            support: "⚠️"
-          - type: "float8 (E5M2)"
-            support: "⚠️"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "⚠️"
-
-  - group: "Communication"
-    tag: "communication"
-    libraries:
-      - name: "RCCL"
-        tag: "rccl"
-        doc_link: "rccl:api-reference/library-specification"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-  - group: "Math Libraries"
-    tag: "math-libs"
-    libraries:
-      - name: "hipBLAS"
-        tag: "hipblas"
-        doc_link: "hipblas:reference/data-type-support"
-        data_types:
-          - type: "float16"
-            support: "⚠️"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipBLASLt"
-        tag: "hipblaslt"
-        doc_link: "hipblaslt:reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "float4"
-            support: "✅"
-          - type: "float6 (E2M3)"
-            support: "✅"
-          - type: "float6 (E3M2)"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-
-      - name: "hipFFT"
-        tag: "hipfft"
-        doc_link: "hipfft:reference/fft-api-usage"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipRAND"
-        tag: "hiprand"
-        doc_link: "hiprand:api-reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "Output only"
-          - type: "int16"
-            support: "Output only"
-          - type: "int32"
-            support: "Output only"
-          - type: "int64"
-            support: "Output only"
-          - type: "float16"
-            support: "Output only"
-          - type: "float32"
-            support: "Output only"
-          - type: "float64"
-            support: "Output only"
-
-      - name: "hipSOLVER"
-        tag: "hipsolver"
-        doc_link: "hipsolver:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipSPARSE"
-        tag: "hipsparse"
-        doc_link: "hipsparse:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipSPARSELt"
-        tag: "hipsparselt"
-        doc_link: "hipsparselt:reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-
-      - name: "rocBLAS"
-        tag: "rocblas"
-        doc_link: "rocblas:reference/data-type-support"
-        data_types:
-          - type: "float16"
-            support: "⚠️"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocFFT"
-        tag: "rocfft"
-        doc_link: "rocfft:reference/api"
-        data_types:
-          - type: "float16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocRAND"
-        tag: "rocrand"
-        doc_link: "rocrand:api-reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "Output only"
-          - type: "int16"
-            support: "Output only"
-          - type: "int32"
-            support: "Output only"
-          - type: "int64"
-            support: "Output only"
-          - type: "float16"
-            support: "Output only"
-          - type: "float32"
-            support: "Output only"
-          - type: "float64"
-            support: "Output only"
-
-      - name: "rocSOLVER"
-        tag: "rocsolver"
-        doc_link: "rocsolver:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocSPARSE"
-        tag: "rocsparse"
-        doc_link: "rocsparse:reference/precision"
-        data_types:
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocWMMA"
-        tag: "rocwmma"
-        doc_link: "rocwmma:api-reference/api-reference-guide"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "Output only"
-          - type: "float8 (E4M3)"
-            support: "Input only"
-          - type: "float8 (E5M2)"
-            support: "Input only"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "tensorfloat32"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "Tensile"
-        tag: "tensile"
-        doc_link: "tensile:reference/precision-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "float8 (E4M3)"
-            support: "✅"
-          - type: "float8 (E5M2)"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "tensorfloat32"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-  - group: "Primitives"
-    tag: "primitives"
-    libraries:
-      - name: "hipCUB"
-        tag: "hipcub"
-        doc_link: "hipcub:api-reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "hipTensor"
-        tag: "hiptensor"
-        doc_link: "hiptensor:api-reference/api-reference"
-        data_types:
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocPRIM"
-        tag: "rocprim"
-        doc_link: "rocprim:reference/data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float16"
-            support: "✅"
-          - type: "bfloat16"
-            support: "✅"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
-
-      - name: "rocThrust"
-        tag: "rocthrust"
-        doc_link: "rocthrust:data-type-support"
-        data_types:
-          - type: "int8"
-            support: "✅"
-          - type: "int16"
-            support: "✅"
-          - type: "int32"
-            support: "✅"
-          - type: "int64"
-            support: "✅"
-          - type: "float16"
-            support: "⚠️"
-          - type: "bfloat16"
-            support: "⚠️"
-          - type: "float32"
-            support: "✅"
-          - type: "float64"
-            support: "✅"
--- a/docs/data/rocm-software-stack-7_0_0.jpg
+++ b/docs/data/rocm-software-stack-7_0_0.jpg
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -23,126 +23,93 @@ The table below summarizes information about ROCm-enabled deep learning framewor
      - Installation options
      - GitHub

-    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
+    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
-        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_ 
+        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
+   
+    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__

      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a> 

-    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
+    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
+   
+    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>

-    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
+    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
+   
+    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 

-    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
+    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
      - .. raw:: html
-
+         
          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
+   
+    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
      - .. raw:: html
-
+         
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__

      - .. raw:: html
+         
+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>      

-          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
-      - .. raw:: html
-
-          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
-      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
-      - .. raw:: html
-
-          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
-      - .. raw:: html
-
-          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
-      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#build-your-own-docker-image>`__
-      - .. raw:: html
-
-          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
-
-    * - `FlashInfer <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/flashinfer-compatibility.html>`__
-      - .. raw:: html
-
-          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html"><i class="fas fa-link fa-lg"></i></a>
-      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#use-a-prebuilt-docker-image-with-flashinfer-pre-installed>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#build-your-own-docker-image>`__
-      - .. raw:: html
-
-          <a href="https://github.com/ROCm/flashinfer"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.
@@ -157,3 +124,10 @@ through the following guides.

 * :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

+
+
+
+
+
+
+
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
 The GEMM library
 `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
 provides a benchmark tool for its supported operations. Refer to the
-`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
+`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
 for details.

 * Example 1: Benchmark mix fp8 GEMM
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
@@ -1,445 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
-                 ROCm vLLM Docker image.
-   :keywords: model, MAD, automation, dashboarding, validate
-
-**********************************
-vLLM inference performance testing
-**********************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm vLLM
-   inference performance documentation. See :doc:`../vllm` for the latest version.
-
-.. _vllm-benchmark-unified-docker-812:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
-
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
-
-   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
-   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
-   accelerators and includes the following components:
-
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `vLLM <https://docs.vllm.ai/en/latest>`__
-        - {{ unified_docker.vllm_version }}
-
-      * - `PyTorch <https://github.com/ROCm/pytorch>`__
-        - {{ unified_docker.pytorch_version }}
-
-      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-        - {{ unified_docker.hipblaslt_version }}
-
-With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-812>` for
-MI300X series accelerators.
-
-What's new
-==========
-
-The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
-
-* Upgraded to vLLM v0.10.
-
-* FP8 KV cache support via AITER.
-
-* Full graph capture support via AITER.
-
-Supported models
-================
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
-
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
-
-   .. _vllm-benchmark-available-models-812:
-
-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-      <div class="row">
-         <div class="col-2 me-2 model-param-head">Model group</div>
-         <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-         </div>
-      </div>
-
-      <div class="row mt-1">
-         <div class="col-2 me-2 model-param-head">Model</div>
-         <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-         {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% endif %}
-      {% endfor %}
-   {% endfor %}
-         </div>
-      </div>
-      </div>
-
-   .. _vllm-benchmark-vllm-812:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
-         Some models require access authorization prior to use via an external license agreement through a third party.
-
-      {% endfor %}
-   {% endfor %}
-
-.. note::
-
-   vLLM is a toolkit and library for LLM inference and serving. AMD implements
-   high-performance custom kernels and modules in vLLM to enhance performance.
-   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-   more information.
-
-.. _vllm-benchmark-performance-measurements-812:
-
-Performance measurements
-========================
-
-To evaluate performance, the
-`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
-
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
-
-   Pull the Docker image
-   =====================
-
-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull {{ unified_docker.pull_tag }}
-
-   Benchmarking
-   ============
-
-   Once the setup is complete, choose between two options to reproduce the
-   benchmark results:
-
-   .. _vllm-benchmark-mad-812:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         .. tab-item:: MAD-integrated benchmarking
-
-            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-               directory and install the required packages on the host machine.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD
-                  pip install -r requirements.txt
-
-            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  madengine run \
-                      --tags {{model.mad_tag}} \
-                      --keep-model-dir \
-                      --live-output \
-                      --timeout 28800
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
-            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
-            and ``{{ model.mad_tag }}_serving.csv``.
-
-            Although the :ref:`available models
-            <vllm-benchmark-available-models-812>` are preconfigured to collect
-            offline throughput and online serving performance data, you can
-            also change the benchmarking parameters. See the standalone
-            benchmarking tab for more information.
-
-            {% if model.tunableop %}
-
-            .. note::
-
-               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
-               TunableOp automatically explores different implementations and configurations of certain PyTorch
-               operators to find the fastest one for your hardware.
-
-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
-               the ``--tunableop on`` argument in your run.
-
-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
-               performance-collection run.
-
-            {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            .. rubric:: Download the Docker image and required scripts
-
-            1. Run the vLLM benchmark tool independently by starting the
-               `Docker container <{{ unified_docker.docker_hub_url }}>`_
-               as shown in the following snippet.
-
-               .. code-block:: shell
-
-                  docker pull {{ unified_docker.pull_tag }}
-                  docker run -it \
-                      --device=/dev/kfd \
-                      --device=/dev/dri \
-                      --group-add video \
-                      --shm-size 16G \
-                      --security-opt seccomp=unconfined \
-                      --security-opt apparmor=unconfined \
-                      --cap-add=SYS_PTRACE \
-                      -v $(pwd):/workspace \
-                      --env HUGGINGFACE_HUB_CACHE=/workspace \
-                      --name test \
-                      {{ unified_docker.pull_tag }}
-
-            2. In the Docker container, clone the ROCm MAD repository and navigate to the
-               benchmark scripts directory at ``~/MAD/scripts/vllm``.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD/scripts/vllm
-
-            3. To start the benchmark, use the following command with the appropriate options.
-
-               .. code-block::
-
-                  ./run.sh \
-                      --config $CONFIG_CSV \
-                      --model_repo {{ model.model_repo }} \
-                      <overrides>
-
-               .. dropdown:: Benchmark options
-                  :open:
-
-                  .. list-table::
-                     :header-rows: 1
-                     :align: center
-
-                     * - Name
-                       - Options
-                       - Description
-
-                     * - ``--config``
-                       - ``configs/default.csv``
-                       - Run configs from the CSV for the chosen model repo and benchmark.
-
-                     * -
-                       - ``configs/extended.csv``
-                       - 
-
-                     * -
-                       - ``configs/performance.csv``
-                       - 
-
-                     * - ``--benchmark``
-                       - ``throughput``
-                       - Measure offline end-to-end throughput.
-
-                     * - 
-                       - ``serving``
-                       - Measure online serving performance.
-
-                     * - 
-                       - ``all``
-                       - Measure both throughput and serving.
-
-                     * - ``<overrides>``
-                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
-                       - Additional overrides to the config CSV.
-
-                  The input sequence length, output sequence length, and tensor parallel (TP) are
-                  already configured. You don't need to specify them with this script.
-
-               .. note::
-
-                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
-
-                  If you encounter the following error, pass your access-authorized Hugging
-                  Face token to the gated models.
-
-                  .. code-block::
-
-                     OSError: You are trying to access a gated repo.
-
-                     # pass your HF_TOKEN
-                     export HF_TOKEN=$your_personal_hf_token
-
-            .. rubric:: Benchmarking examples
-
-            Here are some examples of running the benchmark with various options:
-
-            * Throughput benchmark
-
-              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block:: shell
-
-                 export MAD_MODEL_NAME={{ model.mad_tag }}
-                 ./run.sh \
-                     --config configs/default.csv \
-                     --model_repo {{model.model_repo}} \
-                     --benchmark throughput
-
-              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
-
-            * Serving benchmark
-
-              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block::
-
-                 export MAD_MODEL_NAME={{ model.mad_tag }}
-                 ./run.sh \
-                     --config configs/default.csv \
-                     --model_repo {{model.model_repo}} \
-                     --benchmark serving
-
-              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
-
-            .. raw:: html
-
-               <style>
-               mjx-container[jax="CHTML"][display="true"] {
-                  text-align: left;
-                  margin: 0;
-               }
-               </style>
-
-            .. note::
-
-               Throughput is calculated as:
-
-               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
-
-               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
-      {% endfor %}
-   {% endfor %}
-
-Advanced usage
-==============
-
-For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
-
-Reproducing the Docker image
----------------------------
-
-To reproduce this ROCm/vLLM Docker image release, follow these steps:
-
-1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/vllm.git
-
-2. Check out the specific release commit.
-
-   .. code-block:: shell
-
-      cd vllm
-      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
-
-3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
-
-   .. code-block:: shell
-
-      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
-
-Further reading
-===============
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
-
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
-
- To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
-
- To learn how to run community models from Hugging Face on AMD GPUs, see
-  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
-
- To learn how to fine-tune LLMs and optimize inference, see
-  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
-
- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
-
-Previous versions
-=================
-
-See :doc:`vllm-history` to find documentation for previous releases
-of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst
@@ -1,448 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
-   :keywords: model, MAD, automation, dashboarding, validate
-
-**********************************
-vLLM inference performance testing
-**********************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm vLLM
-   inference performance documentation. See :doc:`../vllm` for the latest version.
-
-.. _vllm-benchmark-unified-docker-909:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-
-   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
-   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
-   accelerators and includes the following components:
-
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
-      {% endfor %}
-
-With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-909>` for
-MI300X series accelerators.
-
-What's new
-==========
-
-The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
-
-* Upgraded to vLLM v0.10.1.
-
-* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
-
-* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
-
-.. _vllm-benchmark-supported-models-909:
-
-Supported models
-================
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   .. _vllm-benchmark-available-models-909:
-
-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
-
-   .. _vllm-benchmark-vllm-909:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{ model.mad_tag }}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
-         Some models require access authorization prior to use via an external license agreement through a third party.
-      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
-         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
-      {% endif %}
-
-      {% endfor %}
-   {% endfor %}
-
-.. _vllm-benchmark-performance-measurements-909:
-
-Performance measurements
-========================
-
-To evaluate performance, the
-`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before running inference workloads.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   Pull the Docker image
-   =====================
-
-   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull {{ docker.pull_tag }}
-
-   Benchmarking
-   ============
-
-   Once the setup is complete, choose between two options to reproduce the
-   benchmark results:
-
-   .. _vllm-benchmark-mad-909:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         .. tab-item:: MAD-integrated benchmarking
-
-            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
-
-            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-               directory and install the required packages on the host machine.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD
-                  pip install -r requirements.txt
-
-            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  madengine run \
-                      --tags {{model.mad_tag}} \
-                      --keep-model-dir \
-                      --live-output \
-                      --timeout 28800
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
-            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
-            and ``{{ model.mad_tag }}_serving.csv``.
-
-            Although the :ref:`available models
-            <vllm-benchmark-available-models-909>` are preconfigured to collect
-            offline throughput and online serving performance data, you can
-            also change the benchmarking parameters. See the standalone
-            benchmarking tab for more information.
-
-            {% if model.tunableop %}
-
-            .. note::
-
-               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
-               TunableOp automatically explores different implementations and configurations of certain PyTorch
-               operators to find the fastest one for your hardware.
-
-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
-               the ``--tunableop on`` argument in your run.
-
-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
-               performance-collection run.
-
-            {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
-
-            .. seealso::
-
-               For more information on configuration, see the `config files
-               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
-               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
-               for descriptions of available configuration options
-               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
-               additional benchmarking information.
-
-            .. rubric:: Launch the container
-
-            You can run the vLLM benchmark tool independently by starting the
-            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
-            in the following snippet.
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-               docker run -it \
-                   --device=/dev/kfd \
-                   --device=/dev/dri \
-                   --group-add video \
-                   --shm-size 16G \
-                   --security-opt seccomp=unconfined \
-                   --security-opt apparmor=unconfined \
-                   --cap-add=SYS_PTRACE \
-                   -v $(pwd):/workspace \
-                   --env HUGGINGFACE_HUB_CACHE=/workspace \
-                   --name test \
-                   {{ docker.pull_tag }}
-
-            .. rubric:: Throughput command
-
-            Use the following command to start the throughput benchmark.
-
-            .. code-block:: shell
-
-               model={{ model.model_repo }}
-               tp={{ model.config.tp }}
-               num_prompts=1024
-               in=128
-               out=128
-               dtype={{ model.config.dtype }}
-               kv_cache_dtype={{ model.config.kv_cache_dtype }}
-               max_num_seqs=1024
-               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
-               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-               max_model_len={{ model.config.max_model_len }}
-
-               vllm bench throughput --model $model \
-                   -tp $tp \
-                   --num-prompts $num_prompts \
-                   --input-len $in \
-                   --output-len $out \
-                   --dtype $dtype \
-                   --kv-cache-dtype $kv_cache_dtype \
-                   --max-num-seqs $max_num_seqs \
-                   --max-seq-len-to-capture $max_seq_len_to_capture \
-                   --max-num-batched-tokens $max_num_batched_tokens \
-                   --max-model-len $max_model_len \
-                   --trust-remote-code \
-                   --output-json ${model}_throughput.json \
-                   --gpu-memory-utilization 0.9
-
-            .. rubric:: Serving command
-
-            1. Start the server using the following command:
-
-               .. code-block:: shell
-
-                  model={{ model.model_repo }}
-                  tp={{ model.config.tp }}
-                  dtype={{ model.config.dtype }}
-                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
-                  max_num_seqs=256
-                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
-                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-                  max_model_len={{ model.config.max_model_len }}
-
-                  vllm serve $model \
-                      -tp $tp \
-                      --dtype $dtype \
-                      --kv-cache-dtype $kv_cache_dtype \
-                      --max-num-seqs $max_num_seqs \
-                      --max-seq-len-to-capture $max_seq_len_to_capture \
-                      --max-num-batched-tokens $max_num_batched_tokens \
-                      --max-model-len $max_model_len \
-                      --no-enable-prefix-caching \
-                      --swap-space 16 \
-                      --disable-log-requests \
-                      --trust-remote-code \
-                      --gpu-memory-utilization 0.9
-
-               Wait until the model has loaded and the server is ready to accept requests.
-
-            2. On another terminal on the same machine, run the benchmark:
-
-               .. code-block:: shell
-
-                  # Connect to the container
-                  docker exec -it test bash
-
-                  # Wait for the server to start
-                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
-
-                  # Run the benchmark
-                  model={{ model.model_repo }}
-                  max_concurrency=1
-                  num_prompts=10
-                  in=128
-                  out=128
-                  vllm bench serve --model $model \
-                      --percentile-metrics "ttft,tpot,itl,e2el" \
-                      --dataset-name random \
-                      --ignore-eos \
-                      --max-concurrency $max_concurrency \
-                      --num-prompts $num_prompts \
-                      --random-input-len $in \
-                      --random-output-len $out \
-                      --trust-remote-code \
-                      --save-result \
-                      --result-filename ${model}_serving.json
-
-            .. note::
-
-               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
-               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
-
-               If you encounter the following error, pass your access-authorized Hugging
-               Face token to the gated models.
-
-               .. code-block::
-
-                  OSError: You are trying to access a gated repo.
-
-                  # pass your HF_TOKEN
-                  export HF_TOKEN=$your_personal_hf_token
-
-            .. raw:: html
-
-               <style>
-               mjx-container[jax="CHTML"][display="true"] {
-                  text-align: left;
-                  margin: 0;
-               }
-               </style>
-
-            .. note::
-
-               Throughput is calculated as:
-
-               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
-
-               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
-      {% endfor %}
-   {% endfor %}
-
-Advanced usage
-==============
-
-For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
-
-Reproducing the Docker image
----------------------------
-
-To reproduce this ROCm/vLLM Docker image release, follow these steps:
-
-1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/vllm.git
-
-2. Check out the specific release commit.
-
-   .. code-block:: shell
-
-      cd vllm
-      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
-
-3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
-
-   .. code-block:: shell
-
-      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
-
-Further reading
-===============
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
-
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
-
- To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
-
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-  a brief introduction to vLLM and optimization strategies.
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
-
- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
-
-Previous versions
-=================
-
-See :doc:`vllm-history` to find documentation for previous releases
-of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
@@ -120,7 +120,7 @@ vLLM inference performance testing
   ==================================

   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-   see the developer's guide at `<https://github.com/ROCm/vllm/blob/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker/README.md>`__.
+   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

   System validation
   =================
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -16,7 +16,7 @@ vLLM inference performance testing

 .. _vllm-benchmark-unified-docker-715:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -46,7 +46,7 @@ vLLM inference performance testing
        - {{ unified_docker.hipblaslt_version }}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-715>` for
+inference performance numbers <vllm-benchmark-performance-measurements>` for
 MI300X series accelerators.

 What's new
@@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D
 Supported models
 ================

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -219,7 +219,7 @@ system's configuration.
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-            Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
+            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -16,121 +16,103 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

-   * - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
-       (latest)
-     -
-       * ROCm 7.0.0
-       * vLLM 0.10.2
-       * PyTorch 2.9.0
-     -
-       * :doc:`Documentation <../vllm>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
-
-   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909``
-     -
-       * ROCm 6.4.1
-       * vLLM 0.10.1
-       * PyTorch 2.7.0
-     -
-       * :doc:`Documentation <vllm-0.10.1-20250909>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__
-
   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
-     -
+       (latest)
+     - 
       * ROCm 6.4.1
       * vLLM 0.10.0
       * PyTorch 2.7.0
-     -
-       * :doc:`Documentation <vllm-0.10.0-20250812>`
+     - 
+       * :doc:`Documentation <../vllm>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
-     -
+     - 
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.9.1-20250715>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
-     -
+     - 
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.9.1-20250702>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
-     -
+     - 
       * ROCm 6.4.1
       * vLLM 0.9.0.1
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.9.0.1-20250605>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`__

   * - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
-     -
+     - 
       * ROCm 6.3.1
       * 0.8.5 vLLM (0.8.6.dev)
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.8.5-20250521>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__

   * - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
-     -
+     - 
       * ROCm 6.3.1
       * vLLM 0.8.5
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.8.5-20250513>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__

   * - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
-     -
+     - 
       * ROCm 6.3.1
       * vLLM 0.8.3
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.8.3-20250415>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__

   * - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
-     -
+     - 
       * ROCm 6.3.1
       * vLLM 0.7.3
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.7.3-20250325>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__

   * - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
-     -
+     - 
       * ROCm 6.3.1
       * vLLM 0.6.6
       * PyTorch 2.7.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.6.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__

   * - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
-     -
+     - 
       * ROCm 6.2.1
       * vLLM 0.6.4
       * PyTorch 2.5.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.6.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__

   * - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
-     -
+     - 
       * ROCm 6.2.0
       * vLLM 0.4.3
       * PyTorch 2.4.0
-     -
+     - 
       * :doc:`Documentation <vllm-0.4.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -16,7 +16,7 @@ PyTorch inference performance testing

   The `ROCm PyTorch Docker <https://hub.docker.com/r/rocm/pytorch/tags>`_ image offers a prebuilt,
   optimized environment for testing model inference performance on AMD Instinct™ MI300X series
-   GPUs. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
+   accelerators. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
   tool with the ROCm PyTorch container to test inference performance on various models efficiently.

   .. _pytorch-inference-benchmark-available-models:
@@ -31,30 +31,26 @@ PyTorch inference performance testing
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>

-         <div class="row gx-0 pt-1" style="display: none;">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
+        <div class="row mt-1" style="display: none;">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+            <div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
      {% endfor %}
-            </div>
-         </div>
+   {% endfor %}
+          </div>
+        </div>
      </div>

   {% for model_group in model_groups %}
@@ -175,7 +171,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
@@ -1,257 +0,0 @@
-.. meta::
-   :description: SGLang multi-node disaggregated distributed inference using Mooncake
-   :keywords: model, sglang, mooncake, disagg, disaggregated, distributed, multi-node, docker
-
-******************************************
-SGLang distributed inference with Mooncake
-******************************************
-
-As LLM inference increasingly demands handling massive models and dynamic workloads, efficient
-distributed inference becomes essential. Traditional co-located architectures face bottlenecks due
-to tightly coupled memory and compute resources, which limits scalability and flexibility.
-Disaggregated inference refers to the process of splitting the inference of LLMs into distinct
-phases. This architecture, facilitated by libraries like Mooncake, uses high-bandwidth
-RDMA to transfer the Key-Value (KV) cache between prefill and decode nodes.
-This allows for independent resource scaling and optimization, resulting in
-improved efficiency and throughput.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-
-   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
-   serving engine for large language models (LLMs) and vision models. The
-   ROCm-enabled `SGLang base Docker image <{{ docker.docker_hub_url }}>`__
-   bundles SGLang with PyTorch, which is optimized for AMD Instinct MI300X series
-   GPUs. It includes the following software components:
-
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
-      {% endfor %}
-
-The following sections guide you through setting up and running SGLang and Mooncake for
-disaggregated distributed inference on a Slurm cluster using AMD Instinct MI300X series GPUs
-backed by Mellanox CX-7 NICs.
-
-Prerequisites
-=============
-
-Before starting, ensure you have:
-
-* A Slurm cluster with at least three nodes: one for the proxy, one for prefill (``xP``), and one for decode (``yD``).
-
-  ``Nodes -> xP + yD + 1``
-
-* A Dockerized environment with SGLang, Mooncake, etcd, and NIC drivers built in. See :ref:`sglang-disagg-inf-build-docker-image` for instructions.
-
-* A shared filesystem for storing models, scripts, and logs (cluster-specific).
-
-Supported models
-================
-
-The following models are supported for SGLang disaggregated prefill/decode
-inference. Some instructions, commands, and recommendations in this
-documentation might vary by selected model.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
-
-   {% set model_groups = data.model_groups %}
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model type</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.model_repo | lower }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.model_repo | lower }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{ model.model_repo }}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`__ to learn more about this model.
-         Some models require access authorization prior to use through an external license agreement with a third party.
-
-      {% endfor %}
-   {% endfor %}
-
-.. _sglang-disagg-inf-build-docker-image:
-
-Build the Docker image
----------------------
-
-Get the Dockerfile located in
-`<https://github.com/ROCm/MAD/blob/develop/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile>`__.
-It uses `lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
-<https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729>`__
-as the base Docker image and installs the necessary components for Mooncake, etcd, and Mellanox network
-drivers.
-
-.. code-block:: shell
-
-   git clone https://github.com/ROCm/MAD.git
-   cd MAD/docker
-   docker build \
-       -t sglang_disagg_pd_image \
-       -f sglang_disagg_inference.ubuntu.amd.Dockerfile .
-
-Benchmarking
-============
-
-The `<https://github.com/ROCm/MAD/tree/develop/scripts/sglang_disagg>`__
-repository contains scripts to launch SGLang inference with prefill/decode
-disaggregation via Mooncake for supported models.
-
-* `scripts/sglang_disagg/run_xPyD_models.slurm <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/run_xPyD_models.slurm>`__
-  -- the main Slurm batch script to launch Docker containers on all nodes using ``sbatch`` or ``salloc``.
-
-* `scripts/sglang_disagg/sglang_disagg_server.sh <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/sglang_disagg_server.sh>`__
-  -- the entrypoint script that runs inside each container to start the correct service -- proxy, prefill, or decode.
-
-* `scripts/sglang_disagg/benchmark_xPyD.sh <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/benchmark_xPyD.sh>`__
-  -- the benchmark script to run the GSM8K accuracy benchmark and the SGLang benchmarking tool for performance measurement.
-
-* `scripts/sglang_disagg/benchmark_parser.py <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/benchmark_parser.py>`__
-  -- the log parser script to be run on the concurrency benchmark log file to generate tabulated data.
-
-Launch the service
------------------
-
-The service is deployed using a Slurm batch script that orchestrates the containers across the
-allocated nodes.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
-
-   {% set model_groups = data.model_groups %}
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{ model.model_repo }}
-
-      .. code-block:: shell
-
-         # Clone the MAD repo if you haven't already and
-         # navigate to the scripts directory
-         git clone https://github.com/ROCm/MAD.git
-         cd MAD/scripts/sglang_disagg/
-
-         # Slurm sbatch run command
-         export DOCKER_IMAGE_NAME=sglang_disagg_pd_image
-         export xP=<num_prefill_nodes>
-         export yD=<num_decode_nodes>
-         export MODEL_NAME={{ model.model_repo }}
-         # num_nodes = xP + yD + 1
-         sbatch -N <num_nodes> -n <num_nodes> --nodelist=<Nodes> run_xPyD_models.slurm
-
-      {% endfor %}
-   {% endfor %}
-
-Post-run logs and testing
-------------------------
-
-Logs are stored in your shared filesystem in the directory specified by the ``LOG_PATH`` variable in the Slurm script.
-A new directory named after the Slurm job ID is created for each run.
-
-Inside that directory, you can access various logs:
-
-* ``pd_sglang_bench_serving.sh_NODE<...>.log`` -- the main log for each server node.
-
-* ``etcd_NODE<...>.log`` -- logs for etcd services.
-
-* ``prefill_NODE<...>.log`` -- logs for the prefill services.
-
-* ``decode_NODE<...>.log`` -- logs for the decode services.
-
-Use the benchmark parser script for concurrency logs to tabulate different data.
-
-.. code-block:: shell
-
-   python3 benchmark_parser.py <log_path/benchmark_XXX_CONCURRENCY.log>
-
-To verify the service is responsive, you can try sending a ``curl`` request to test the launched
-server from the Docker container on the proxy node. For example:
-
-.. code-block:: shell
-
-   curl -X POST http://127.0.0.1:30000/generate \
-       -H "Content-Type: application/json" \
-       -d '{ "text": "Let me tell you a story ", "sampling_params": { "temperature": 0.3 } }'
-
-Known issues
-============
-
-When running larger models, such as DeepSeek-V3 and Llama-3.1-405B-Instruct-FP8-KV, at
-higher concurrency levels (512+), the following error might occur:
-
-.. code-block:: shell-session
-
-   <TransferEncodingError: 400, message:
-    Not enough data to satisfy transfer length header.
-
-   The above exception was the direct cause of the following exception:
-
-   Traceback (most recent call last):
-   ...
-
-This leads to dropping requests and lower throughput.
-
-Further reading
-===============
-
- To learn about Mooncake, see `Welcome to Mooncake <https://kvcache-ai.github.io/Mooncake/>`__.
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/sgl-project/sglang/tree/main/benchmark/blog_v0_2>`__.
-
- See the base upstream Docker image on `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729>`__.
-
- To learn more about system settings and management practices to configure your system for
-  MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
-
- To learn how to run community models from Hugging Face on AMD GPUs, see
-  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
-
- To learn how to fine-tune LLMs and optimize inference, see
-  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
-
- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
-
-Previous versions
-=================
-
-See :doc:`previous-versions/sglang-history` to find documentation for previous releases
-of SGLang inference performance testing.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
@@ -2,19 +2,19 @@
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
   :keywords: model, MAD, automation, dashboarding, validate

-*****************************************************************
-SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
-*****************************************************************
+************************************
+SGLang inference performance testing
+************************************

 .. _sglang-benchmark-unified-docker:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml

-   {% set docker = data.dockers[0] %}
+   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}

   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
   serving engine for large language models (LLMs) and vision models. The
-   ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
+   ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
   bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
   accelerators. It includes the following software components:

@@ -24,10 +24,14 @@ SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
      * - Software component
        - Version

-      {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
-      {% endfor %}
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `SGLang <https://docs.sglang.ai/index.html>`__
+        - {{ unified_docker.sglang_version }}
+
+      * - `PyTorch <https://github.com/pytorch/pytorch>`__
+        - {{ unified_docker.pytorch_version }}

 System validation
 =================
@@ -46,8 +50,8 @@ system's configuration.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml

-   {% set unified_docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
+   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+   {% set model_groups = data.sglang_benchmark.model_groups %}

   Pull the Docker image
   =====================
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -1,142 +1,124 @@
 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

 **********************************
 vLLM inference performance testing
 **********************************

-.. _vllm-benchmark-unified-docker-930:
+.. _vllm-benchmark-unified-docker-812:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set docker = data.dockers[0] %}
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}

-   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
-   prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
-   GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
-   specifically for AMD data center GPUs and includes the following components:
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

-   .. tab-set::
+   .. list-table::
+      :header-rows: 1

-      .. tab-item:: {{ docker.pull_tag }}
+      * - Software component
+        - Version

-         .. list-table::
-            :header-rows: 1
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}

-            * - Software component
-              - Version
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}

-            {% for component_name, component_version in docker.components.items() %}
-            * - {{ component_name }}
-              - {{ component_version }}
-            {% endfor %}
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-930>` for
-AMD Instinct GPUs.
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
+MI300X series accelerators.

 What's new
 ==========

 The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.

-* Added support for AMD Instinct MI355X and MI350X GPUs.
+* Upgraded to vLLM v0.10.

-* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
+* FP8 KV cache support via AITER.

-  * Llama 4 Scout and Maverick
-
-  * DeepSeek R1 0528 FP8
-
-  * MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
-
-  * GPT OSS 20B and 120B
-
-  * Qwen 3 32B, 30B-A3B, and 235B-A22B
-
-* Removed the deprecated ``--max-seq-len-to-capture`` flag.
-
-* ``--gpu-memory-utilization`` is now configurable via the `configuration files
-  <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
-  repository.
-
-.. _vllm-benchmark-supported-models-930:
+* Full graph capture support via AITER.

 Supported models
 ================

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}

-   .. _vllm-benchmark-available-models-930:
+   .. _vllm-benchmark-available-models-812:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started. MXFP4 models
-   are only supported on MI355X and MI350X GPUs.
+   documentation might vary by model -- select one to get started.

   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
+      <div class="row">
+         <div class="col-2 me-2 model-param-head">Model group</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
         </div>
      </div>

-   .. _vllm-benchmark-vllm-930:
+      <div class="row mt-1">
+         <div class="col-2 me-2 model-param-head">Model</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+         </div>
+      </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-812:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

-   .. container:: model-doc {{ model.mad_tag }}
-
-
-      {% if model.precision == "float4" %}
-      .. important::
-
-         MXFP4 is supported only on MI355X and MI350X GPUs.
-      {% endif %}
+   .. container:: model-doc {{model.mad_tag}}

      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
-      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
-         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
-      {% endif %}
-      {% if model.precision == "float4" and model.model_repo.startswith("amd") %}
-         This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
-      {% endif %}

      {% endfor %}
   {% endfor %}

-.. _vllm-benchmark-performance-measurements-930:
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements-812:

 Performance measurements
 ========================
@@ -150,7 +132,7 @@ page provides reference throughput and serving measurements for inferencing popu
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 System validation
 =================
@@ -167,32 +149,28 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-Pull the Docker image
-=====================
-
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set docker = data.dockers[0] %}
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}

-   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

-      docker pull {{ docker.pull_tag }}
+      docker pull {{ unified_docker.pull_tag }}

-Benchmarking
-============
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
+   Benchmarking
+   ============

   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad-930:
+   .. _vllm-benchmark-mad-812:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -203,9 +181,6 @@ Benchmarking

         .. tab-item:: MAD-integrated benchmarking

-            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
-
            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

@@ -215,9 +190,8 @@ Benchmarking
                  cd MAD
                  pip install -r requirements.txt

-            2. On the host machine, use this command to run the performance benchmark test on
-               the `{{model.model}} <{{ model.url }}>`_ model using one node with the
-               :literal:`{{model.precision}}` data type.
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

               .. code-block:: shell

@@ -225,7 +199,8 @@ Benchmarking
                  madengine run \
                      --tags {{model.mad_tag}} \
                      --keep-model-dir \
-                      --live-output
+                      --live-output \
+                      --timeout 28800

            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
@@ -233,7 +208,7 @@ Benchmarking
            and ``{{ model.mad_tag }}_serving.csv``.

            Although the :ref:`available models
-            <vllm-benchmark-available-models-930>` are preconfigured to collect
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
            offline throughput and online serving performance data, you can
            also change the benchmarking parameters. See the standalone
            benchmarking tab for more information.
@@ -257,142 +232,132 @@ Benchmarking

         .. tab-item:: Standalone benchmarking

-            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
+            .. rubric:: Download the Docker image and required scripts

-            .. seealso::
-
-               For more information on configuration, see the `config files
-               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
-               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
-               for descriptions of available configuration options
-               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
-               additional benchmarking information.
-
-            .. rubric:: Launch the container
-
-            You can run the vLLM benchmark tool independently by starting the
-            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
-            in the following snippet.
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-               docker run -it \
-                   --device=/dev/kfd \
-                   --device=/dev/dri \
-                   --group-add video \
-                   --shm-size 16G \
-                   --security-opt seccomp=unconfined \
-                   --security-opt apparmor=unconfined \
-                   --cap-add=SYS_PTRACE \
-                   -v $(pwd):/workspace \
-                   --env HUGGINGFACE_HUB_CACHE=/workspace \
-                   --name test \
-                   {{ docker.pull_tag }}
-
-            .. rubric:: Throughput command
-
-            Use the following command to start the throughput benchmark.
-
-            .. code-block:: shell
-
-               model={{ model.model_repo }}
-               tp={{ model.config.tp }}
-               num_prompts={{ model.config.num_prompts | default(1024) }}
-               in={{ model.config.in | default(128) }}
-               out={{ model.config.in | default(128) }}
-               dtype={{ model.config.dtype | default("auto") }}
-               kv_cache_dtype={{ model.config.kv_cache_dtype }}
-               max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
-               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-               max_model_len={{ model.config.max_model_len }}
-
-               vllm bench throughput --model $model \
-                   -tp $tp \
-                   --num-prompts $num_prompts \
-                   --input-len $in \
-                   --output-len $out \
-                   --dtype $dtype \
-                   --kv-cache-dtype $kv_cache_dtype \
-                   --max-num-seqs $max_num_seqs \
-                   --max-num-batched-tokens $max_num_batched_tokens \
-                   --max-model-len $max_model_len \
-                   --trust-remote-code \
-                   --output-json ${model}_throughput.json \
-                   --gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
-
-            .. rubric:: Serving command
-
-            1. Start the server using the following command:
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.

               .. code-block:: shell

-                  model={{ model.model_repo }}
-                  tp={{ model.config.tp }}
-                  dtype={{ model.config.dtype }}
-                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
-                  max_num_seqs=256
-                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-                  max_model_len={{ model.config.max_model_len }}
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}

-                  vllm serve $model \
-                      -tp $tp \
-                      --dtype $dtype \
-                      --kv-cache-dtype $kv_cache_dtype \
-                      --max-num-seqs $max_num_seqs \
-                      --max-num-batched-tokens $max_num_batched_tokens \
-                      --max-model-len $max_model_len \
-                      --no-enable-prefix-caching \
-                      --swap-space 16 \
-                      --disable-log-requests \
-                      --trust-remote-code \
-                      --gpu-memory-utilization 0.9
-
-               Wait until the model has loaded and the server is ready to accept requests.
-
-            2. On another terminal on the same machine, run the benchmark:
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.

               .. code-block:: shell

-                  # Connect to the container
-                  docker exec -it test bash
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm

-                  # Wait for the server to start
-                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
-
-                  # Run the benchmark
-                  model={{ model.model_repo }}
-                  max_concurrency=1
-                  num_prompts=10
-                  in=128
-                  out=128
-                  vllm bench serve --model $model \
-                      --percentile-metrics "ttft,tpot,itl,e2el" \
-                      --dataset-name random \
-                      --ignore-eos \
-                      --max-concurrency $max_concurrency \
-                      --num-prompts $num_prompts \
-                      --random-input-len $in \
-                      --random-output-len $out \
-                      --trust-remote-code \
-                      --save-result \
-                      --result-filename ${model}_serving.json
-
-            .. note::
-
-               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
-               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
-
-               If you encounter the following error, pass your access-authorized Hugging
-               Face token to the gated models.
+            3. To start the benchmark, use the following command with the appropriate options.

               .. code-block::

-                  OSError: You are trying to access a gated repo.
+                  ./run.sh \
+                      --config $CONFIG_CSV \
+                      --model_repo {{ model.model_repo }} \
+                      <overrides>

-                  # pass your HF_TOKEN
-                  export HF_TOKEN=$your_personal_hf_token
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``--config``
+                       - ``configs/default.csv``
+                       - Run configs from the CSV for the chosen model repo and benchmark.
+
+                     * -
+                       - ``configs/extended.csv``
+                       - 
+
+                     * -
+                       - ``configs/performance.csv``
+                       - 
+
+                     * - ``--benchmark``
+                       - ``throughput``
+                       - Measure offline end-to-end throughput.
+
+                     * - 
+                       - ``serving``
+                       - Measure online serving performance.
+
+                     * - 
+                       - ``all``
+                       - Measure both throughput and serving.
+
+                     * - ``<overrides>``
+                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
+                       - Additional overrides to the config CSV.
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               .. note::
+
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark throughput
+
+              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
+
+            * Serving benchmark
+
+              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark serving
+
+              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.

            .. raw:: html

@@ -417,36 +382,31 @@ Advanced usage
 ==============

 For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.

 Reproducing the Docker image
 ----------------------------

-To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
+To reproduce this ROCm/vLLM Docker image release, follow these steps:

-1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.

   .. code-block:: shell

-      git clone https://github.com/vllm-project/vllm.git
      cd vllm
+      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978

-2. Use the following command to build the image directly from the specified commit.
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

-   .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+   .. code-block:: shell

-      {% set docker = data.dockers[0] %}
-      .. code-block:: shell
-
-         docker build -f docker/Dockerfile.rocm \
-             --build-arg REMOTE_VLLM=1 \
-             --build-arg VLLM_REPO=https://github.com/ROCm/vllm \
-             --build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
-             -t vllm-rocm .
-
-   .. tip::
-
-      Replace ``vllm-rocm`` with your desired image tag.
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .

 Further reading
 ===============
@@ -457,14 +417,17 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
-
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-  a brief introduction to vLLM and optimization strategies.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
 - For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 <rocm-install-on-linux:install/quick-start>`.

 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
-`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.
+`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.

-You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
+You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
 distribution's package manager. See the following documentation resources to get started:

 * :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
@@ -47,7 +47,7 @@ Deep learning frameworks
 ========================

 ROCm supports deep learning frameworks and libraries including `PyTorch
-<https://pytorch.org>`_, `TensorFlow
+<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
 <https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.

 Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
@@ -57,4 +57,4 @@ Next steps
 ==========

 After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
-to test the optimal performance of your AMD hardware. See :doc:`system-setup/index` to get started.
+to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
--- a/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
@@ -1,14 +1,12 @@
-:orphan:
-
 .. meta::
   :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference

 .. _rocm-for-ai-system-health-bench:

-*****************************************
-System health benchmarks for AI workloads
-*****************************************
+************************
+System health benchmarks
+************************

 Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).

@@ -33,7 +31,7 @@ installed, run the following command:
   sudo apt install rocm-validation-suite

 See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
-and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html>`_
+and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
 in the Instinct documentation for more detailed instructions.

 Benchmark, stress, and qualification tests
@@ -43,7 +41,7 @@ The GPU stress test runs various GEMM computations as workloads to stress the GP
 meets the configured target GFLOPS.

 Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#benchmark-stress-qualification>`_
+<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
 section of the Instinct documentation for usage instructions.

 BabelStream test
@@ -55,7 +53,7 @@ BabelStream tests are included with the RVS package as part of the `BABEL module
 <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.

 For more information, see `Performance benchmarking
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#babelstream>`_
+<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
 in the Instinct documentation.

 RCCL tests
@@ -64,7 +62,7 @@ RCCL tests
 The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
 communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
 the performance and verifies the correctness of these collective operations.
-This helps ensure optimal scaling for multi-GPU tasks.
+This helps ensure optimal scaling for multi-accelerator tasks.

 1. To get started, build RCCL-tests using the official instructions in the README at
   `<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
@@ -77,8 +75,8 @@ This helps ensure optimal scaling for multi-GPU tasks.
      make

 2. Run the suggested RCCL tests -- see `RCCL benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/rdma-benchmarking.html#rccl-benchmarking-results>`_
-   in the AMD Instinct customer acceptance guide.
+   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
+   in the Instinct performance benchmarking documentation for instructions.

 TransferBench test
 ==================
--- a/docs/how-to/rocm-for-ai/system-setup/index.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/index.rst
@@ -1,40 +0,0 @@
-.. meta::
-   :description: System setup and validation steps for AI training and inference on ROCm
-   :keywords: AMD Instinct, ROCm, GPU, AI, training, inference, benchmarking, performance, validation
-
-*************************************
-System setup for AI workloads on ROCm
-*************************************
-
-Before you begin training or inference on AMD Instinct™ GPUs, complete
-the following system setup and validation steps to ensure optimal performance.
-
-Prerequisite system validation
-==============================
-
-First, confirm that your system meets all software and hardware prerequisites.
-See :doc:`prerequisite-system-validation`.
-
-Docker images for AMD Instinct GPUs
-===================================
-
-AMD provides prebuilt Docker images for AMD Instinct™ MI300X and MI325X
-GPUs. These images include ROCm-enabled deep learning frameworks and
-essential software components. They support single-node and multi-node configurations
-and are ready for training and inference workloads out of the box.
-
-Multi-node training
-------------------
-
-For instructions on enabling multi-node training, see :doc:`multi-node-setup`.
-
-System optimization and validation
-==================================
-
-Before running workloads, verify that the system is configured correctly and
-operating at peak efficiency. Recommended steps include:
-
- Disabling NUMA auto-balancing
- Running system benchmarks to validate hardware performance
-
-For details on running system health checks, see :doc:`system-health-check`.
--- a/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst
@@ -1,320 +0,0 @@
-.. meta::
-   :description: Multi-node setup for AI training
-   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training
-
-.. _rocm-for-ai-multi-node-setup:
-
-*********************************
-Multi-node setup for AI workloads
-*********************************
-
-AMD provides ready-to-use Docker images for AMD Instinct™ MI300X and MI325X
-GPUs containing ROCm-capable deep learning frameworks and essential
-software components. These Docker images can run and leverage multiple nodes if
-they are available. This page describes how to enable the multi-node training
-of AI workloads on AMD Instinct GPUs.
-
-Prerequisites
-=============
-
-Before starting, ensure your environment meets the following requirements:
-
-* Multi-node networking: your cluster should have a configured multi-node network. For setup
-  instructions, see the `Multi-node network configuration for AMD Instinct
-  accelerators
-  <https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html>`__
-  guide in the Instinct documentation.
-
-* ROCm Docker container to simplify environment setup for AI workloads. See the following resources to get started:
-
-  * :doc:`Training a model with Megatron-LM and ROCm <../training/benchmark-docker/megatron-lm>`
-
-  * :doc:`Training a model with PyTorch and ROCm <../training/benchmark-docker/pytorch-training>`
-
-  * :doc:`Training a model with JAX MaxText and ROCm <../training/benchmark-docker/jax-maxtext>`
-
-* Slurm workload manager to run the :ref:`provided examples <multi-node-setup-training-examples>`.
-
-Install required packages
-=========================
-
-To run multi-node workloads, ensure you have all the required packages installed based on your
-network device. For example, on Ubuntu systems:
-
-.. code-block:: shell
-
-   apt install -y iproute2
-
-   apt install -y linux-headers-"$(uname -r)" libelf-dev
-
-   apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
-
-Compile and install the RoCE library
------------------------------------
-
-If you're using Broadcom NICs, you need to compile and install the RoCE (RDMA
-over Converged Ethernet) library. See `RoCE cluster network configuration guide
-for AMD Instinct accelerators
-<https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/roce-network-config.html#roce-cluster-network-configuration-guide-for-amd-instinct-accelerators>`__
-for more information.
-
-See the `Ethernet networking guide for AMD
-Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source
-<https://docs.broadcom.com/doc/957608-AN2XX#page=81>`_ for more details.
-
-.. important::
-
-   It is crucial to install the exact same version of the RoCE library that
-   is installed on your host system. Also, ensure that the path to these
-   libraries on the host is correctly mounted into your Docker container.
-   Failure to do so can lead to compatibility issues and communication
-   failures.
-
-1. Set ``BUILD_DIR`` to the path on the host system where the Broadcom drivers and ``bnxt_rocelib`` source are located.
-   Then, navigate to the ``bnxt_rocelib`` directory.
-
-   .. code-block:: shell
-
-      export BUILD_DIR=/path/to/your/broadcom_drivers_on_host
-      cd $BUILD_DIR/drivers_linux/bnxt_rocelib/
-
-2. The ``bnxt_rocelib`` directory contains a version of ``libbnxt_re`` in a zipped ``.tar.gz`` file.
-
-   .. code-block:: shell
-
-      tar -xf libbnxt_re-a.b.c.d.tar.gz
-      cd libbnxt_re-a.b.c.d
-
-3. Compile and install the RoCE library.
-
-   .. code-block:: shell
-
-      sh autogen.sh
-      ./configure
-      make
-      find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \;
-      make install all
-      sh -c "echo /usr/local/lib >> /etc/ld.so.conf"
-      ldconfig
-      cp -f bnxt_re.driver /etc/libibverbs.d/
-      find . -name "*.so" -exec md5sum {} \;
-      BUILT_MD5SUM=$(find . -name "libbnxt_re-rdmav*.so" -exec md5sum {} \; | cut -d " " -f 1)
-
-Environment setup
-=================
-
-Before running multi-node workloads, set these essential environment variables:
-
-Master address
--------------
-
-By default, ``localhost`` is used for single-node configurations. Change
-``localhost`` to the master node's resolvable hostname or IP address:
-
-.. code-block:: bash
-
-   export MASTER_ADDR="${MASTER_ADDR:-localhost}"
-
-Number of nodes
---------------
-
-Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
-
-.. code-block:: bash
-
-   export NNODES="${NNODES:-<num_nodes>}"
-
-Node ranks
----------
-
-Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
-Node ranks should be unique across all nodes in the cluster.
-
-.. code-block:: bash
-
-   export NODE_RANK="${NODE_RANK:-<node_rank>}"
-
-Network interface
-----------------
-
-Update the network interface in the script to match your system's network interface. To
-find your network interface, run the following (outside of any Docker container):
-
-.. code-block:: bash
-
-   ip a
-
-Look for an active interface (status "UP") with an IP address in the same subnet as
-your other nodes. Then, update the following variable in the script, for
-example:
-
-.. code-block:: bash
-
-   export NCCL_SOCKET_IFNAME=ens50f0np0
-
-This variable specifies which network interface to use for inter-node communication.
-Setting this variable to the incorrect interface can result in communication failures
-or significantly reduced performance.
-
-.. tip::
-
-  This command sets ``NCCL_SOCKET_IFNAME``'s value to the last RDMA interface.
-
-  .. code-block:: bash
-
-     export NCCL_SOCKET_IFNAME=$(rdma link show | awk '{print $NF}' | sort | tail -n1)
-
-RDMA/IB interface
-----------------
-
-Set the RDMA interfaces to be used for communication. NICs can come from different vendors and the names of the RDMA interface can be different. To get the list of all the RDMA/IB devices, run:
-
-.. code-block:: bash
-
-   ibv_devices
-
-Set ``NCCL_IB_HCA`` to a comma-separated list of the RDMA/IB devices you want
-to use. For example, if
-``rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7`` are your RDMA
-interfaces, then set:
-
-.. code-block:: bash
-
-   # If using Broadcom NIC
-   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
-   # If using Mellanox NIC
-   # export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
-
-.. tip::
-
-  Alternatively, if you want to choose the RDMA interface automatically, you
-  can use the following. This command will sort the RDMA interfaces and then
-  select the first eight RDMA interfaces.
-
-  .. code-block:: bash
-
-     export NCCL_IB_HCA=$(ibv_devices | awk 'NR>2 {print $1}' | sort | head -n 8 | paste -sd,)
-
-Global ID index
---------------
-
-Update the global ID index if you're using RoCE.
-
-.. code-block:: bash
-
-   export NCCL_IB_GID_INDEX=3
-
-.. _multi-node-setup-training-examples:
-
-Multi-node training examples
-============================
-
-The following examples use the Slurm workload manager to launch jobs on
-multiple nodes. To run these scripts as-is, you must have a Slurm environment
-configured. The scripts are designed to work with both Broadcom Thor 2 and
-Mellanox NICs by automatically installing the required libraries and setting
-the necessary environment variables. For systems with Broadcom NICs, the
-scripts assume the host's RoCE library is located in the ``/opt`` directory.
-
-The following benchmarking examples demonstrate the training of a Llama 3 8B model
-across multiple 8-GPU nodes, using FSDP for intra-node parallelism and DP for
-inter-node parallelism.
-
-.. _rocm-for-ai-multi-node-setup-jax-train-example:
-
-JAX MaxText
-----------
-
-1. Download the desired multi-node benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
-
-   .. code-block:: shell
-
-      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/jax-maxtext/gpu-rocm/llama3_8b_multinode.sh
-
-   Or clone the `<https://github.com/ROCm/MAD>`__ repository.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/MAD
-      cd MAD/scripts/jax-maxtext/gpu-rocm
-
-2. Run the benchmark for multi-node training.
-
-   .. code-block:: shell
-
-      sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-.. _rocm-for-ai-multi-node-setup-pyt-train-example:
-
-PyTorch training
----------------
-
-.. note::
-
-   The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
-   with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
-   following example refers to the legacy workflow :ref:`Training a
-   model with PyTorch <amd-pytorch-training-multinode-examples>`.
-
-1. Download the ``run_multinode_train.sh`` benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/pytorch_train>`__.
-
-   .. code-block:: shell
-
-      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/pytorch_train/run_multinode_train.sh
-
-   Or clone the `<https://github.com/ROCm/MAD>`__ repository.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/MAD
-      cd MAD/scripts/pytorch_train
-
-2. Run the benchmark for multi-node training.
-
-   .. code-block:: shell
-
-      sbatch -N <num_nodes> run_multinode_train.sh
-
-.. seealso::
-
-   See :ref:`Training a model with PyTorch <amd-pytorch-multinode-examples>` for more examples and information.
-
-Megatron-LM
-----------
-
-.. note::
-
-   The Megatron-LM Docker image now focuses on :ref:`Training a model with
-   Primus and Megatron <amd-primus-megatron-multi-node-examples>`. The
-   following example refers to the legacy Megatron-LM :ref:`Training a model
-   with Megatron-LM <amd-megatron-lm-multi-node-examples>` and might have
-   limited support.
-
-1. Download the ``train_llama_slurm.sh`` benchmarking script from
-   `<https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama_slurm.sh>`__.
-
-2. Set the network interface parameters as per the above guidelines and run the script.
-
-   .. code-block:: shell
-
-      cd </path/to/your/Megatron-LM>
-      export NETWORK_INTERFACE=$NCCL_SOCKET_IFNAME
-      export NCCL_IB_HCA=$NCCL_IB_HCA
-      export IMAGE=docker.io/rocm/megatron-lm:latest  # or your preferred image
-      export DATA_CACHE_PATH=/nfs/mounted/repo
-
-      sbatch -N <num_nodes> examples/llama/train_llama_slurm.sh <MODEL_SIZE> <MBS> <GBS> <SEQ_LENGTH> <FSDP> <RECOMPUTE>
-
-3. For example, to run a Llama 3 8B workload in BF16 precision, use the following command.
-
-   .. code-block:: shell
-
-      MODEL_NAME=llama3 sbatch -N 8 examples/llama/train_llama_slurm.sh 8 2 128 8192 0 0
-      # Other parameters, such as TP, FP8 datatype, can be adjusted in the script.
-
-Further reading
-===============
-
-* `Multi-node network configuration for AMD Instinct accelerators <https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html>`__
-
-* `Ethernet networking guide for AMD Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#page=81>`__
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -2,114 +2,80 @@
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

-******************************************
-Training a model with JAX MaxText on ROCm
-******************************************
+**************************************
+Training a model with MaxText for ROCm
+**************************************

 MaxText is a high-performance, open-source framework built on the Google JAX
 machine learning library to train LLMs at scale. The MaxText framework for
 ROCm is an optimized fork of the upstream
 `<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
-on AMD MI300X series GPUs.
+on AMD MI300X series accelerators.

-The MaxText for ROCm training Docker image
-provides a prebuilt environment for training on AMD Instinct MI300X and MI325X GPUs,
+The MaxText for ROCm training Docker image (``rocm/jax-training:maxtext-v25.5``)
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| JAX                      | 0.4.35                         |
+--------------------------+--------------------------------+
+| Python                   | 3.10.12                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.13.0-ae9c477a                |
+--------------------------+--------------------------------+

-   {% set dockers = data.dockers %}
-   .. tab-set::
+Supported features and models
+=============================

-      {% for docker in dockers %}
-      {% set jax_version = docker.components["JAX"] %}
-
-      .. tab-item:: ``{{ docker.pull_tag }}``
-         :sync: {{ docker.pull_tag }}
-
-         .. list-table::
-            :header-rows: 1
-
-            * - Software component
-              - Version
-
-            {% for component_name, component_version in docker.components.items() %}
-            * - {{ component_name }}
-              - {{ component_version }}
-
-            {% endfor %}
-         {% if jax_version == "0.6.0" %}
-         .. note::
-
-            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
-            not configured correctly. For now you can turn it off by setting
-            ``shardy=False`` during the training run. You can also follow the `migration
-            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
-            it.
-         {% endif %}
-
-      {% endfor %}
-
-MaxText with on ROCm provides the following key features to train large language models efficiently:
+MaxText provides the following key features to train large language models efficiently:

 - Transformer Engine (TE)

- Flash Attention (FA) 3 -- with or without sequence input packing
+- Flash Attention (FA) 3

 - GEMM tuning

 - Multi-node support

- NANOO FP8 quantization support
+.. _amd-maxtext-model-support:

-.. _amd-maxtext-model-support-v257:
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

-Supported models
-================
+* Llama 3.3 70B

-The following models are pre-optimized for performance on AMD Instinct MI300
-series GPUs. Some instructions, commands, and available training
-configurations in this documentation might vary by model -- select one to get
-started.
+* Llama 3.1 8B

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+* Llama 3.1 70B

-   {% set model_groups = data.model_groups %}
-   .. raw:: html
+* Llama 3 8B

-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
+* Llama 3 70B

-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite

 .. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

+Unsupported features
+--------------------
+
+Currently, MaxText's default packed input format is not supported. Using this format
+with the current Docker image results in incorrect attention calculations
+across different input sequences. Support for packed input format is planned for a future release.
+
 System validation
 =================

@@ -132,225 +98,278 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.

+.. _amd-maxtext-multi-node-setup:
+
+Multi-node setup
+----------------
+
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+
+1. Install the following packages to build and install the RDMA driver.
+
+   .. code-block:: shell
+
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
+
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
+
+2. Set the following environment variables.
+
+   a. Master address
+
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-download-docker:
+
 Pull the Docker image
 ---------------------

-Use the following command to pull the Docker image from Docker Hub.
+1. Use the following command to pull the Docker image from Docker Hub.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+   .. code-block:: shell

-   {% set dockers = data.dockers %}
-   .. tab-set::
+      docker pull rocm/jax-training:maxtext-v25.5

-      {% for docker in dockers %}
-      {% set jax_version = docker.components["JAX"] %}
+2. Use the following command to launch the Docker container. Note that the benchmarking scripts
+   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
+   and execute the benchmark.

-      .. tab-item:: JAX {{ jax_version }}
-         :sync: {{ docker.pull_tag }}
+   .. code-block:: shell

-         .. code-block:: shell
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5

-            docker pull {{ docker.pull_tag }}
+.. _amd-maxtext-get-started:

-      {% endfor %}
-
-.. _amd-maxtext-multi-node-setup-v257:
-
-Multi-node configuration
------------------------
-
-See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
-environment for multi-node training.
-
-.. _amd-maxtext-get-started-v257:
-
-Benchmarking
-============
-
-Once the setup is complete, choose between two options to reproduce the
-benchmark results:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
-
-   .. _vllm-benchmark-mad:
-
-   {% set dockers = data.dockers %}
-   {% set model_groups = data.model_groups %}
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         {% if model.mad_tag and "single-node" in model.doc_options %}
-         .. tab-item:: MAD-integrated benchmarking
-
-            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-               directory and install the required packages on the host machine.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD
-                  pip install -r requirements.txt
-
-            2. Use this command to run the performance benchmark test on the {{ model.model }} model
-               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  madengine run \
-                      --tags {{model.mad_tag}} \
-                      --keep-model-dir \
-                      --live-output \
-                      --timeout 28800
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/perf.csv/``.
-         {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            .. rubric:: Download the Docker image and required scripts
-
-            Run the JAX MaxText benchmark tool independently by starting the
-            Docker container as shown in the following snippet.
-
-            .. tab-set::
-               {% for docker in dockers %}
-               {% set jax_version = docker.components["JAX"] %}
-
-               .. tab-item:: JAX {{ jax_version }}
-                  :sync: {{ docker.pull_tag }}
-
-                  .. code-block:: shell
-
-                     docker pull {{ docker.pull_tag }}
-               {% endfor %}
-
-            {% if model.model_repo and "single-node" in model.doc_options %}
-            .. rubric:: Single node training
-
-            1. Set up environment variables.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
-                  export HF_HOME=<Location of saved/cached Hugging Face models>
-
-               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
-               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
-
-               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
-               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
-               Downloaded files typically get cached to ``~/.cache/huggingface``.
-
-            2. Launch the Docker container.
-
-               .. tab-set::
-                  {% for docker in dockers %}
-                  {% set jax_version = docker.components["JAX"] %}
-
-                  .. tab-item:: JAX {{ jax_version }}
-                     :sync: {{ docker.pull_tag }}
-
-                     .. code-block:: shell
-
-                        docker run -it \
-                            --device=/dev/dri \
-                            --device=/dev/kfd \
-                            --network host \
-                            --ipc host \
-                            --group-add video \
-                            --cap-add=SYS_PTRACE \
-                            --security-opt seccomp=unconfined \
-                            --privileged \
-                            -v $HOME:$HOME \
-                            -v $HOME/.ssh:/root/.ssh \
-                            -v $HF_HOME:/hf_cache \
-                            -e HF_HOME=/hf_cache \
-                            -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN
-                            --shm-size 64G \
-                            --name training_env \
-                            {{ docker.pull_tag }}
-                  {% endfor %}
-
-            3. In the Docker container, clone the ROCm MAD repository and navigate to the
-               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD/scripts/jax-maxtext
-
-            4. Run the setup scripts to install libraries and datasets needed
-               for benchmarking.
-
-               .. code-block:: shell
-
-                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
-
-            5. To run the training benchmark without quantization, use the following command:
-
-               .. code-block:: shell
-
-                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
-
-               For quantized training, use the following command:
-
-               .. code-block:: shell
-
-                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
-
-            {% endif %}
-            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
-            .. rubric:: Multi-node training
-
-            The following examples use SLURM to run on multiple nodes.
-
-            .. note::
-
-               The following scripts will launch the Docker container and run the
-               benchmark. Run them outside of any Docker container.
-
-            1. Make sure ``$HF_HOME`` is set before running the test. See
-               `ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
-               for more details on downloading the Llama models before running the
-               benchmark.
-
-            2. To run multi-node training for {{ model.model }},
-               use the
-               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
-               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
-
-            3. Run the multi-node training benchmark script.
-
-               .. code-block:: shell
-
-                  sbatch -N <num_nodes> {{ model.multinode_training_script }}
-
-         {% else %}
-            .. rubric:: Multi-node training
-
-            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
-            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
-         {% endif %}
-      {% endfor %}
-   {% endfor %}
-
-Further reading
+Getting started
 ===============

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+The following examples demonstrate how to get started with single node
+and multi-node training using the benchmarking scripts provided at
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.

- To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+.. important::

- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+
+Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.
+
+Single node training benchmarking examples
+------------------------------------------
+
+* Example 1: Single node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
+
+* Example 2: Single node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
+
+* Example 3: Single node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
+
+* Example 4: Single node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
+
+* Example 5: Single node training with Llama 3.3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
+
+* Example 6: Single node training with DeepSeek V2 16B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
+
+  .. note::
+
+     The TFLOP/s value reported by MaxText for DeepSeek is not accurate. Use
+     tokens/s as the performance indicator instead.
+
+Multi-node training benchmarking examples
+-----------------------------------------
+
+The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
+own cluster setup.
+
+* Example 1: Multi-node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_7b_multinode.sh
+
+* Example 2: Multi-node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_70b_multinode.sh
+
+* Example 3: Multi-node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_8b_multinode.sh
+
+* Example 4: Multi-node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_70b_multinode.sh

 Previous versions
 =================
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -5,25 +5,27 @@
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

 ******************************************
-Training a model with Megatron-LM on ROCm
+Training a model with Megatron-LM for ROCm
 ******************************************

 .. caution::

-   Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
-   To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
+   This Docker environment now focuses on Primus with Megatron-Core and has
+   limited support for the ROCm Megatron-LM framework. See :doc:`primus-megatron`.
+
+   To learn how to migrate your existing workloads to Primus with Megatron-Core,
   see :doc:`previous-versions/megatron-lm-primus-migration-guide`.

 The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
 a specialized fork of the robust Megatron-LM, designed to enable efficient
 training of large-scale language models on AMD GPUs. By leveraging AMD
-Instinct™ MI300X series GPUs, Megatron-LM delivers enhanced
+Instinct™ MI300X series accelerators, Megatron-LM delivers enhanced
 scalability, performance, and resource utilization for AI workloads. It is
 purpose-built to support models like Llama, DeepSeek, and Mixtral,
 enabling developers to train next-generation AI models more
 efficiently.

-AMD provides ready-to-use Docker images for MI300X series GPUs containing
+AMD provides ready-to-use Docker images for MI300X series accelerators containing
 essential components, including PyTorch, ROCm libraries, and Megatron-LM
 utilities. It contains the following software components to accelerate training
 workloads:
@@ -61,39 +63,39 @@ workloads:
   ================

   The following models are supported for training performance benchmarking with Megatron-LM and ROCm
-   on AMD Instinct MI300X series GPUs.
+   on AMD Instinct MI300X series accelerators.
   Some instructions, commands, and training recommendations in this documentation might
   vary by model -- select one to get started.

   {% set model_groups = data.model_groups %}
   .. raw:: html

-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
+         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+           <div class="row">
+             <div class="col-2 me-2 model-param-head">Model</div>
+             <div class="row col-10">
      {% for model_group in model_groups %}
-               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
-            </div>
-         </div>
+             </div>
+           </div>

-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
+           <div class="row mt-1">
+             <div class="col-2 me-2 model-param-head">Model variant</div>
+             <div class="row col-10">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
         {% endfor %}
      {% endfor %}
-            </div>
+             </div>
+           </div>
         </div>
-      </div>

 .. note::

@@ -115,7 +117,7 @@ popular AI models.
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
   only reflects the latest version of this training benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 System validation
 =================
@@ -138,11 +140,11 @@ Environment setup
 =================

 Use the following instructions to set up the environment, configure the script to train models, and
-reproduce the benchmark results on MI300X series GPUs with the AMD Megatron-LM Docker
+reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
 image.

 .. _amd-megatron-lm-requirements:
-
+ 
 Download the Docker image
 -------------------------

@@ -152,7 +154,7 @@ Download the Docker image
   1. Use the following command to pull the Docker image from Docker Hub.

      {% if dockers|length > 1 %}
-      .. tab-set::
+      .. tab-set:: 

         {% for docker in data.dockers %}
         .. tab-item:: {{ docker.doc_name }}
@@ -281,11 +283,25 @@ Configuration

   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.

-Multi-node configuration
------------------------
+Network interface
+-----------------

-Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
-training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
+Update the network interface in the script to match your system's network interface. To
+find your network interface, run the following (outside of any Docker container):
+
+.. code-block:: bash
+
+   ip a
+
+Look for an active interface that has an IP address in the same subnet as
+your other nodes. Then, update the following variables in the script, for
+example:
+
+.. code-block:: bash
+
+   export NCCL_SOCKET_IFNAME=ens50f0np0
+
+   export GLOO_SOCKET_IFNAME=ens50f0np0

 .. _amd-megatron-lm-tokenizer:

@@ -526,6 +542,46 @@ Download the dataset

   Ensure that the files are accessible inside the Docker container.

+Multi-node configuration
+------------------------
+
+If you're running multi-node training, update the following environment variables. They can
+also be passed as command line arguments. Refer to the following example configurations.
+
+* Change ``localhost`` to the master node's hostname:
+
+  .. code-block:: shell
+
+     MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
+
+  .. code-block:: shell
+
+     NNODES="${NNODES:-1}"
+
+* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
+
+  .. code-block:: shell
+
+     NODE_RANK="${NODE_RANK:-0}"
+
+* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
+  NFS directory) for multi-node runs:
+
+  .. code-block:: shell
+
+     DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
+
+* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
+  inside a Docker container, either install the drivers inside the Docker container or pass the network
+  drivers from the host while creating the Docker container.
+
+  .. code-block:: shell
+
+     # Specify which RDMA interfaces to use for communication
+     export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+
 .. _amd-megatron-lm-run-training:

 Run training
@@ -533,7 +589,7 @@ Run training

 Use the following example commands to set up the environment, configure
 :ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
-MI300X series GPUs with the AMD Megatron-LM environment.
+MI300X series accelerators with the AMD Megatron-LM environment.

 Single node training
 --------------------
@@ -558,7 +614,7 @@ Single node training
      FSDP=1 \
      MODEL_SIZE=70 \
      TOTAL_ITERS=50 \
-      bash examples/llama/train_llama3.sh
+      bash examples/llama/train_llama3.sh 

   .. note::

@@ -716,7 +772,7 @@ Single node training

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

-   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
+   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
   navigate to the Megatron-LM folder and use the following command.

   .. code-block:: shell
@@ -751,16 +807,9 @@ Single node training
      AC=none \
      SEQ_LEN=4096 \
      PAD_LEN=4096 \
-      TRAIN_ITERS=20 \
+      TRAIN_ITERS=50 \
      bash examples/deepseek_v2/train_deepseekv2.sh

-   .. note::
-
-      Note that DeepSeek-V2-Lite is experiencing instability due to GPU memory access fault
-      for large iterations.
-      For stability, it's recommended to use Primus for this workload.
-      See :doc:`primus-megatron`.
-
 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
@@ -871,8 +920,6 @@ Single node training
          RECOMPUTE_ACTIVATIONS=full \
          CKPT_FORMAT=torch_dist

-.. _amd-megatron-lm-multi-node-examples:
-
 Multi-node training examples
 ----------------------------

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
@@ -3,7 +3,7 @@
   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

 ******************************************
-Training MPT-30B with LLM Foundry on ROCm
+Training MPT-30B with LLM Foundry and ROCm
 ******************************************

 MPT-30B is a 30-billion parameter decoder-style transformer-based model from
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -17,21 +17,12 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
     - Components
     - Resources

-   * - 25.7 (latest)
-     - 
-       * ROCm 6.4.1
-       * JAX 0.6.0, 0.5.0
-     - 
-       * :doc:`Documentation <../jax-maxtext>`
-       * `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
-       * `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
-
-   * - 25.5
+   * - 25.5 (latest)
     - 
       * ROCm 6.3.4
       * JAX 0.4.35
     - 
-       * :doc:`Documentation <jax-maxtext-v25.5>`
+       * :doc:`Documentation <../jax-maxtext>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__

   * - 25.4
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic

 - Multi-node support

-.. _amd-maxtext-model-support-v254:
+.. _amd-maxtext-model-support:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -202,14 +202,16 @@ Getting started

 The following examples demonstrate how to get started with single node
 and multi-node training using the benchmarking scripts provided at
-`<https://github.com/ROCm/maxtext/>`__.
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.

 .. important::

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.

 Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
-set correctly and points to your Hugging Face cache directory.
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.

 Single node training benchmarking examples
 ------------------------------------------
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
@@ -1,383 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: How to train a model using JAX MaxText for ROCm.
-   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
-
-**************************************
-Training a model with MaxText for ROCm
-**************************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm JAX MaxText
-   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
-
-MaxText is a high-performance, open-source framework built on the Google JAX
-machine learning library to train LLMs at scale. The MaxText framework for
-ROCm is an optimized fork of the upstream
-`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
-on AMD MI300X series accelerators.
-
-The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
-provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
-including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
-It includes the following software components:
-
-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| JAX                      | 0.4.35                         |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.12                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.13.0-ae9c477a                |
-+--------------------------+--------------------------------+
-
-Supported features and models
-=============================
-
-MaxText provides the following key features to train large language models efficiently:
-
- Transformer Engine (TE)
-
- Flash Attention (FA) 3
-
- GEMM tuning
-
- Multi-node support
-
-.. _amd-maxtext-model-support-v255:
-
-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
-
-* Llama 3.3 70B
-
-* Llama 3.1 8B
-
-* Llama 3.1 70B
-
-* Llama 3 8B
-
-* Llama 3 70B
-
-* Llama 2 7B
-
-* Llama 2 70B
-
-* DeepSeek-V2-Lite
-
-.. note::
-
-   Some models, such as Llama 3, require an external license agreement through
-   a third party (for example, Meta).
-
-Unsupported features
--------------------
-
-Currently, MaxText's default packed input format is not supported. Using this format
-with the current Docker image results in incorrect attention calculations
-across different input sequences. Support for packed input format is planned for a future release.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-Environment setup
-=================
-
-This Docker image is optimized for specific model configurations outlined
-as follows. Performance can vary for other training workloads, as AMD
-doesn’t validate configurations and run conditions outside those described.
-
-.. _amd-maxtext-multi-node-setup-v255:
-
-Multi-node setup
----------------
-
-For multi-node environments, ensure you have all the necessary packages for
-your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
-
-1. Install the following packages to build and install the RDMA driver.
-
-   .. code-block:: shell
-
-      sudo apt install iproute2 -y
-      sudo apt install -y linux-headers-"$(uname-r)" libelf-dev
-      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
-
-   Refer to your NIC manufacturer's documentation for further steps on
-   compiling and installing the RoCE driver. For example, for Broadcom,
-   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
-   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
-
-2. Set the following environment variables.
-
-   a. Master address
-
-      Change ``localhost`` to the master node's resolvable hostname or IP address:
-
-      .. code-block:: bash
-
-         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
-
-   b. Number of nodes
-
-      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
-
-      .. code-block:: bash
-
-         export NNODES="${NNODES:-1}"
-
-   c. Node ranks
-
-      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on)
-      Node ranks should be unique across all nodes in the cluster.
-
-      .. code-block:: bash
-
-         export NODE_RANK="${NODE_RANK:-0}"
-
-   d. Network interface
-
-      Update the network interface in the script to match your system's network interface. To
-      find your network interface, run the following (outside of any Docker container):
-
-      .. code-block:: bash
-
-         ip a
-
-      Look for an active interface with an IP address in the same subnet as
-      your other nodes. Then, update the following variable in the script, for
-      example:
-
-      .. code-block:: bash
-
-         export NCCL_SOCKET_IFNAME=ens50f0np0
-
-      This variable specifies which network interface to use for inter-node communication.
-      Setting this variable to the incorrect interface can result in communication failures
-      or significantly reduced performance.
-
-   e. RDMA interface
-
-      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
-      Then, set the RDMA interfaces to use for communication.
-
-      .. code-block:: bash
-
-         # If using Broadcom NIC
-         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
-         # If using Mellanox NIC
-         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
-
-.. _amd-maxtext-download-docker-v255:
-
-Pull the Docker image
---------------------
-
-1. Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull rocm/jax-training:maxtext-v25.5
-
-2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
-   and execute the benchmark.
-
-   .. code-block:: shell
-
-      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
-
-.. _amd-maxtext-get-started-v255:
-
-Getting started
-===============
-
-The following examples demonstrate how to get started with single node
-and multi-node training using the benchmarking scripts provided at
-`<https://github.com/ROCm/maxtext/>`__.
-
-.. important::
-
-   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
-
-Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
-set correctly and points to your Hugging Face cache directory.
-
-Single node training benchmarking examples
------------------------------------------
-
-* Example 1: Single node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
-
-* Example 2: Single node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
-
-* Example 3: Single node training with Llama 3 8B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
-
-* Example 4: Single node training with Llama 3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
-
-* Example 5: Single node training with Llama 3.3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
-
-* Example 6: Single node training with DeepSeek V2 16B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
-
-  .. note::
-
-     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
-     the tokens/s as a performance indicator.
-
-Multi-node training benchmarking examples
-----------------------------------------
-
-The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
-own cluster setup.
-
-* Example 1: Multi-node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_7b_multinode.sh
-
-* Example 2: Multi-node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_70b_multinode.sh
-
-* Example 3: Multi-node training with Llama 3 8B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-* Example 4: Multi-node training with Llama 3 70B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_70b_multinode.sh
-
-Previous versions
-=================
-
-See :doc:`jax-maxtext-history` to find documentation for previous releases
-of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -16,22 +16,12 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
     - Components
     - Resources

-   * - v25.8 (latest)
+   * - v25.7 (latest)
     - 
-       * ROCm 6.4.3
-       * PyTorch 2.8.0a0+gitd06a406
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
     - 
-       * :doc:`Primus Megatron documentation <../primus-megatron>`
-       * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
-       * `Docker Hub (py310) <https://hub.docker.com/r/rocm/megatron-lm/tags>`__
-
-   * - v25.7
-     - 
-       * ROCm 6.4.2
-       * PyTorch 2.8.0a0+gitd06a406
-     - 
-       * :doc:`Primus Megatron documentation <primus-megatron-v25.7>`
-       * :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.7>`
+       * :doc:`Documentation <../megatron-lm>`
       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__

   * - v25.6
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst
@@ -1,12 +1,12 @@
 :orphan:

-*****************************************************************
-Migrating workloads to Primus (Megatron backend) from Megatron-LM
-*****************************************************************
+**********************************************************************
+Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
+**********************************************************************

 Primus supports Megatron-Core as backend optimization library,
 replacing ROCm Megatron-LM. This document outlines the steps to migrate
-workload from ROCm Megatron-LM to Primus with the Megatron backend.
+workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.

 Model architecture
 ==================
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
-workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support-24-12:
+.. _amd-megatron-lm-model-support:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support-25-3:
+.. _amd-megatron-lm-model-support:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support-25-4:
+.. _amd-megatron-lm-model-support:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7.rst
@@ -1,604 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: How to train a model using Megatron-LM for ROCm.
-   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
-
-********************************************
-Training a model with Primus and Megatron-LM
-********************************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm Megatron-LM
-   training performance documentation. See :doc:`../primus-megatron` for the latest version.
-
-`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
-LLM training framework designed to streamline training. It streamlines LLM
-training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
-Primus is backend-agnostic and supports multiple training engines -- including Megatron.
-
-.. note::
-
-   Primus with the Megatron backend is intended to replace ROCm
-   Megatron-LM in this Dockerized training environment. To learn how to migrate
-   workloads from Megatron-LM to Primus with Megatron, see
-   :doc:`megatron-lm-primus-migration-guide`.
-
-For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
-containing essential components for Primus and Megatron-LM.
-
-.. note::
-
-   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
-   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <megatron-lm-v25.6>`.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
-
-   {% set dockers = data.dockers %}
-   {% set docker = dockers[0] %}
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
-      {% endfor %}
-
-.. _amd-primus-megatron-lm-model-support-v257:
-
-Supported models
-================
-
-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
-Some instructions, commands, and training examples in this documentation might
-vary by model -- select one to get started.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
-
-   {% set model_groups = data.model_groups %}
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
-
-.. note::
-
-   Some models, such as Llama, require an external license agreement through
-   a third party (for example, Meta).
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-.. _mi300x-amd-primus-megatron-lm-training-v257:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
-
-   {% set dockers = data.dockers %}
-      {% set docker = dockers[0] %}
-
-   Environment setup
-   =================
-
-   Use the following instructions to set up the environment, configure the script to train models, and
-   reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
-
-   .. _amd-primus-megatron-lm-requirements-v257:
-
-   Download the Docker image
-   -------------------------
-
-   1. Use the following command to pull the Docker image from Docker Hub.
-
-      .. code-block:: shell
-
-         docker pull {{ docker.pull_tag }}
-
-   2. Launch the Docker container.
-
-      .. code-block:: shell
-
-         docker run -it \
-             --device /dev/dri \
-             --device /dev/kfd \
-             --device /dev/infiniband \
-             --network host --ipc host \
-             --group-add video \
-             --cap-add SYS_PTRACE \
-             --security-opt seccomp=unconfined \
-             --privileged \
-             -v $HOME:$HOME \
-             --shm-size 128G \
-             --name primus_training_env \
-             {{ docker.pull_tag }}
-
-3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
-
-   .. code-block:: shell
-
-      docker start primus_training_env
-      docker exec -it primus_training_env bash
-
-The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
-<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
-
-.. _amd-primus-megatron-lm-environment-setup-v257:
-
-Configuration
-=============
-
-Primus defines a training configuration in YAML for each model in
-`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
-
-   {% set model_groups = data.model_groups %}
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-   .. container:: model-doc {{ model.mad_tag }}
-
-      To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
-      Note that training configuration YAML files for other models follow this naming convention.
-
-      {% endfor %}
-   {% endfor %}
-
-.. note::
-
-   See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
-
-Dataset options
---------------
-
-You can use either mock data or real data for training.
-
-* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
-  value is ``true`` for enabled.
-
-  .. code-block:: yaml
-
-     mock_data: true
-
-* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
-
-  .. code-block:: yaml
-
-     mock_data: false
-     train_data_path: /path/to/your/dataset
-
-  Ensure that the files are accessible inside the Docker container.
-
-.. _amd-primus-megatron-lm-tokenizer-v257:
-
-Tokenizer
---------
-
-In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
-3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
-``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
-<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
-definition. As such, you need to set the ``HF_TOKEN`` environment variable with
-the right permissions to access the tokenizer for each model.
-
-.. code-block:: bash
-
-   # Export your HF_TOKEN in the workspace
-   export HF_TOKEN=<your_hftoken>
-
-.. _amd-primus-megatron-lm-run-training-v257:
-
-Run training
-============
-
-Use the following example commands to set up the environment, configure
-:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars-v257>`, and run training on
-MI300X series accelerators with the AMD Megatron-LM environment.
-
-Single node training
--------------------
-
-To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command:
-
-.. code-block:: shell
-
-   pip install -r requirements.txt
-   export HSA_NO_SCRATCH_RECLAIM=1
-   export NVTE_CK_USES_BWD_V3=1
-
-Once setup is complete, run the appropriate training command.
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
-
-   To run pre-training for Llama 3.3 70B BF16, run:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh \
-          --micro_batch_size 2 \
-          --global_batch_size 16 \
-          --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
-
-   To run pre-training for Llama 3.1 8B FP8, run:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh \
-          --train_iters 50 \
-          --fp8 hybrid
-
-   For Llama 3.1 8B BF16, use the following command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
-
-   To run pre-training for Llama 3.1 70B BF16, run:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh \
-           --train_iters 50
-
-   To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh \
-          --train_iters 50 \
-          --num_layers 40 \
-          --fp8 hybrid \
-          --no_fp8_weight_transpose_cache true
-
-   .. note::
-
-      Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
-
-   To run pre-training for Llama 2 7B FP8, run:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh \
-          --train_iters 50 \
-          --fp8 hybrid
-
-   To run pre-training for Llama 2 7B BF16, run:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
-
-   To run pre-training for Llama 2 70B BF16, run:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
-      bash ./examples/run_pretrain.sh --train_iters 50 
-
-.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
-
-   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, 
-   use the following command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
-      bash examples/run_pretrain.sh \
-          --num_layers 3 \
-          --moe_layer_freq 1 \
-          --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-
-   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
-   use the following command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
-      bash examples/run_pretrain.sh \
-          --global_batch_size 256 \
-          --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
-
-   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
-   use the following command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
-      bash examples/run_pretrain.sh --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-
-   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
-   use the following command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
-      bash examples/run_pretrain.sh \
-          --num_layers 4 \
-          --pipeline_model_parallel_size 1 \
-          --micro_batch_size 1 \
-          --global_batch_size 16 \
-          --train_iters 50
-
-.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
-
-   To run training on a single node for Qwen 2.5 7B BF16, use the following
-   command:
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
-      bash examples/run_pretrain.sh --train_iters 50
-
-   For FP8, use the following command.
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
-      bash examples/run_pretrain.sh \
-          --train_iters 50 \
-          --fp8 hybrid
-
-.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
-
-   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
-
-   .. code-block:: shell
-
-      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
-      bash examples/run_pretrain.sh --train_iters 50
-
-Multi-node training examples
----------------------------
-
-To run training on multiple nodes, you can use the
-`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
-script to launch the multi-node workload. Use the following steps to set up your environment:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
-
-   {% set dockers = data.dockers %}
-   {% set docker = dockers[0] %}
-
-   .. code-block:: shell
-
-      cd /workspace/Primus/
-      export DOCKER_IMAGE={{ docker.pull_tag }}
-      export HF_TOKEN=<your_HF_token>
-      export HSA_NO_SCRATCH_RECLAIM=1
-      export NVTE_CK_USES_BWD_V3=1
-      export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
-      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
-      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
-      export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
-
-.. note::
-
-   * Make sure the correct network drivers are installed on the nodes. If you are running inside a Docker container, either install the drivers inside the container or pass the network drivers from the host when creating the container.
-   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across different clusters, it is encouraged to explicitly export your NCCL parameters for the cluster.
-   * To find your network interface, you can use ``ip a``.
-   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
-
-   To train Llama 3.3 70B FP8 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 4 \
-          --global_batch_size 256 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
-
-   To train Llama 3.3 70B BF16 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 1 \
-          --global_batch_size 256 \
-          --recompute_num_layers 12
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
-
-   To train Llama 3.1 8B FP8 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
-      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
-      bash ./examples/run_slurm_pretrain.sh \
-          --global_batch_size 1024 \
-          --fp8 hybrid
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
-
-   To train Llama 3.1 70B FP8 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 4 \
-          --global_batch_size 256 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
-
-   To train Llama 3.1 70B BF16 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 1 \
-          --global_batch_size 256 \
-          --recompute_num_layers 12
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
-
-   To train Llama 2 7B FP8 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
-      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid
-
-.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
-
-   To train Llama 2 70B FP8 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 10 \
-          --global_batch_size 640 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
-
-   To train Llama 2 70B BF16 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
-      bash ./examples/run_slurm_pretrain.sh \
-          --micro_batch_size 2 \
-          --global_batch_size 1536 \
-          --recompute_num_layers 12
-
-.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
-
-   To train Mixtral 8x7B BF16 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 2 \
-          --global_batch_size 256
-
-.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
-
-   To train Qwen2.5 72B FP8 on 8 nodes, run:
-
-   .. code-block:: shell
-
-      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
-      bash examples/run_slurm_pretrain.sh \
-          --micro_batch_size 8 \
-          --global_batch_size 512 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
-
-.. _amd-primus-megatron-lm-benchmark-test-vars-v257:
-
-Key options
-----------
-
-The following are key options to take note of:
-
-fp8
-  ``hybrid`` enables FP8 GEMMs.
-
-use_torch_fsdp2
-  ``use_torch_fsdp2: 1`` enables PyTorch FSDP2. If FSDP is enabled,
-  set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
-
-profile
-  To enable PyTorch profiling, set these parameters:
-
-  .. code-block:: yaml
-
-     profile: true
-     use_pytorch_profiler: true
-     profile_step_end: 7
-     profile_step_start: 6
-
-train_iters
-  The total number of iterations (default: 50).
-
-mock_data
-  True by default.
-
-micro_batch_size
-  Micro batch size.
-
-global_batch_size
-  Global batch size.
-
-recompute_granularity
-  For activation checkpointing.
-
-num_layers
-  For using a reduced number of layers as with proxy models.
-
-Previous versions
-=================
-
-See :doc:`megatron-lm-history` to find documentation for previous releases
-of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -4,7 +4,7 @@
 PyTorch training performance testing version history
 ****************************************************

-This table lists previous versions of the ROCm PyTorch training Docker image for
+This table lists previous versions of the ROCm Megatron-LM training Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
 previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/pytorch-training/tags>`_.
@@ -16,29 +16,12 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
     - Components
     - Resources

-   * - v25.8 (latest)
-     - 
-       * ROCm 6.4.3
-       * PyTorch 2.8.0a0+gitd06a406
-     - 
-       * :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
-       * :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
-       * `Docker Hub <https://hub.docker.com/r/rocm/pytorch-training/tags>`__
-
-   * - v25.7
-     - 
-       * ROCm 6.4.2
-       * PyTorch 2.8.0a0+gitd06a406
-     - 
-       * :doc:`Documentation <pytorch-training-v25.7>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
-
   * - v25.6
     - 
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <pytorch-training-v25.6>`
+       * :doc:`Documentation <../pytorch-training>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5
--- a/Show More
+++ b/Show More