Update RELEASE.md

update llvm-project link URL
Docs: deep learning table fix
2026-01-10 15:18:11 -05:00 · 2025-09-16 07:29:57 -07:00 · 2025-09-16 09:26:45 -04:00 · 2025-09-16 08:20:03 -04:00 · 2025-09-16 08:12:20 -04:00 · 2025-09-16 08:10:42 -04:00
87 changed files with 11862 additions and 3656 deletions
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: ROCR-Runtime
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -45,6 +64,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ROCR_Runtime_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -65,14 +88,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -82,105 +109,112 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ROCR_Runtime_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - task: Bash@3
-      displayName: Build kfdtest
-      inputs:
-        targetType: 'inline'
-        workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
-        script: |
-          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
-            source /opt/rh/gcc-toolset-14/enable
-          fi
-          mkdir build && cd build
-          cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
-          make
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: kfdtest
-        testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
-        testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
-        os: ${{ job.os }}
-    - task: Bash@3
-      displayName: Build rocrtst
-      inputs:
-        targetType: 'inline'
-        workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
-        script: |
-          echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
-          sudo cat /etc/ld.so.conf.d/rocm-ci.conf
-          sudo ldconfig -v
-          ldconfig -p
-          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
-            source /opt/rh/gcc-toolset-14/enable
-          fi
-          BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
-          export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
-          mkdir build && cd build
-          cmake .. \
-            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm \
-            -DTARGET_DEVICES=${{ job.target }} \
-            -DROCM_DIR=$(Agent.BuildDirectory)/rocm \
-            -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm/bin \
-            -DOPENCL_INC_DIR=$BASE_CLANG_DIR/$NEWEST_CLANG_VER/include
-          make
-          make rocrtst_kernels
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocrtst
-        testExecutable: ./rocrtst64
-        testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-  # docker image will be missing libhwloc5
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ROCR_Runtime_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - task: Bash@3
+        displayName: Build kfdtest
+        inputs:
+          targetType: 'inline'
+          workingDirectory: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest
+          script: |
+            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
+              source /opt/rh/gcc-toolset-14/enable
+            fi
+            mkdir build && cd build
+            cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
+            make
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: kfdtest
+          testExecutable: BIN_DIR=$(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
+          testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          testDir: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/scripts
+          os: ${{ job.os }}
+      - task: Bash@3
+        displayName: Build rocrtst
+        inputs:
+          targetType: 'inline'
+          workingDirectory: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common
+          script: |
+            echo $(Agent.BuildDirectory)/s/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
+            sudo cat /etc/ld.so.conf.d/rocm-ci.conf
+            sudo ldconfig -v
+            ldconfig -p
+            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
+              source /opt/rh/gcc-toolset-14/enable
+            fi
+            BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
+            export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
+            mkdir build && cd build
+            cmake .. \
+              -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm \
+              -DTARGET_DEVICES=${{ job.target }} \
+              -DROCM_DIR=$(Agent.BuildDirectory)/rocm \
+              -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm/bin \
+              -DOPENCL_INC_DIR=$BASE_CLANG_DIR/$NEWEST_CLANG_VER/include
+            make
+            make rocrtst_kernels
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocrtst
+          testExecutable: ./rocrtst64
+          testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          testDir: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common/build/${{ job.target }}
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+    # docker image will be missing libhwloc5
--- a/.azuredevops/components/hip-tests.yml
+++ b/.azuredevops/components/hip-tests.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hip-tests
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -60,6 +79,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: hip_tests_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -76,15 +99,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    # compile hip-tests
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: hip-tests
+        componentName: ${{ parameters.componentName }}
        cmakeSourceDir: '../catch'
        customBuildTarget: build_tests
        extraBuildFlags: >-
@@ -96,9 +122,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -108,52 +137,56 @@ jobs:
        extraEnvVars:
          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: hip_tests_test_${{ job.target }}
-    timeoutInMinutes: 240
-    dependsOn: hip_tests_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Symlink rocm_agent_enumerator
-      inputs:
-        targetType: inline
-        script: |
-          # Assuming that /opt is no longer persistent across runs, test environments are fully ephemeral
-          sudo mkdir -p /opt/rocm/bin
-          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: hip_tests
-        testDir: $(Agent.BuildDirectory)/rocm/share/hip
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        optSymLink: true
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: hip_tests_test_${{ job.target }}
+      timeoutInMinutes: 240
+      dependsOn: hip_tests_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Symlink rocm_agent_enumerator
+        inputs:
+          targetType: inline
+          script: |
+            # /opt is assumed to be ephemeral across runs; -f keeps this step idempotent if a stale link survives on a reused agent
+            sudo mkdir -p /opt/rocm/bin
+            sudo ln -sf $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: $(Agent.BuildDirectory)/rocm/share/hip
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          optSymLink: true
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -178,7 +178,7 @@ jobs:
          mkdir -p $(Agent.BuildDirectory)/temp-deps
          cd $(Agent.BuildDirectory)/temp-deps
          # position-independent LAPACK is required for almalinux8 builds
-          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
+          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
          make -j
          sudo make install
    - script: |
@@ -197,6 +197,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -44,6 +44,7 @@ parameters:
  type: object
  default:
    - joblib
+    - msgpack
 - name: rocmDependencies
  type: object
  default:
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -0,0 +1,236 @@
+parameters:
+- name: componentName
+  type: string
+  default: origami
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
+# set to true if doing full build of ROCm stack
+# and dependencies are pulled from same pipeline
+- name: aggregatePipeline
+  type: boolean
+  default: false
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - wget
+    - python3
+    - python3-dev
+    - python3-pip
+- name: pipModules
+  type: object
+  default:
+    - nanobind>=2.0.0
+- name: rocmDependencies
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+- name: rocmTestDependencies
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt }
+      - { os: almalinux8, packageManager: dnf }
+    testJobs:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - hipBLASLt:
+      name: hipBLASLt
+      sparseCheckoutDir: projects/hipblaslt
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - origami_build
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: origami_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    pool:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -DORIGAMI_BUILD_SHARED_LIBS=ON
+          -DORIGAMI_ENABLE_PYTHON=ON
+          -DORIGAMI_BUILD_TESTING=ON
+          -GNinja
+    - ${{ if ne(job.os, 'almalinux8') }}:
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Build Directory Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/build'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          publishLocation: 'pipeline'
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Python Source Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/python'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          publishLocation: 'pipeline'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: origami_test_${{ job.os }}_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: origami_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Build Directory Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          path: '$(Agent.BuildDirectory)/s/build'
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Python Source Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          path: '$(Agent.BuildDirectory)/s/python'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - script: |
+          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python${PYTHONPATH:+:$PYTHONPATH}  # no empty (cwd) entry when PYTHONPATH is unset
+
+          echo "--- Running origami_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_test.py
+
+          echo "--- Running origami_grid_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
+        displayName: 'Run Python Binding Tests'
+        condition: succeeded()
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -179,6 +179,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/rocblas
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/rocblas/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_BUILD_TYPE=Release
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,6 +8,25 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+- name: rocPyDecodeRepo
+  type: string
+  default: rocpydecode_repo
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -56,10 +75,23 @@ parameters:
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - rocPyDecode:
+      name: rocPyDecode
+      sparseCheckoutDir: ''
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rocDecode_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -83,12 +115,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # key is only passed when downstream jobs are triggered
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -169,3 +204,15 @@ jobs:
        registerROCmPackages: true
        environment: test
        gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -5,6 +5,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -47,19 +63,19 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    testJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocPyDecode_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -74,16 +90,20 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # key is only passed when downstream jobs are triggered
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: 'Save Python Package Paths'
      inputs:
--- a/.azuredevops/components/rocm-core.yml
+++ b/.azuredevops/components/rocm-core.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocm-core
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -27,6 +46,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_core_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:  # emit dependsOn only when upstream builds were supplied
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -50,8 +73,10 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -65,9 +90,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocm_smi_lib
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -32,6 +51,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_smi_lib_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -55,8 +78,10 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -65,51 +90,56 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocm_smi_lib_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_smi_lib
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
+      dependsOn: rocm_smi_lib_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocminfo
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -40,7 +59,11 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocminfo_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -62,14 +85,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -78,65 +105,71 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocminfo_test_${{ job.target }}
-    dependsOn: rocminfo_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocminfo
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocminfo'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_agent_enumerator
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocm_agent_enumerator'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are emitted only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocminfo_test_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}  # must match the build job name, which is derived from componentName
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocminfo'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocm_agent_enumerator
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocm_agent_enumerator'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          registerROCmPackages: true
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocprofiler-compute
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -36,6 +55,7 @@ parameters:
    - pymongo
    - pyyaml
    - setuptools
+    - sqlalchemy
    - tabulate
    - textual
    - textual_plotext
@@ -78,6 +98,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_compute_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -94,15 +118,19 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -111,78 +139,83 @@ jobs:
    #     pipModules: ${{ parameters.pipModules }}
    #     gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_compute_test_${{ job.target }}
-    timeoutInMinutes: 120
-    dependsOn: rocprofiler_compute_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: PYTHON_VERSION
-      value: 3.10
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Add en_US.UTF-8 locale
-      inputs:
-        targetType: inline
-        script: |
-          sudo locale-gen en_US.UTF-8
-          sudo update-locale
-          locale -a
-    - task: Bash@3
-      displayName: Add ROCm binaries to PATH
-      inputs:
-        targetType: inline
-        script: |
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        extraBuildFlags: >-
-          -DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
-          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DCMAKE_BUILD_TYPE=Release
-          -DENABLE_TESTS=ON
-          -DINSTALL_TESTS=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofiler-compute
-        testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
-        testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocprofiler_compute_test_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: rocprofiler_compute_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      - name: PYTHON_VERSION
+        value: '3.10'  # quoted so the version stays the string 3.10 and is never read as the float 3.1
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Add en_US.UTF-8 locale
+        inputs:
+          targetType: inline
+          script: |
+            sudo locale-gen en_US.UTF-8
+            sudo update-locale
+            locale -a
+      - task: Bash@3
+        displayName: Add ROCm binaries to PATH
+        inputs:
+          targetType: inline
+          script: |
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        parameters:
+          extraBuildFlags: >-
+            -DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
+            -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
+            -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+            -DROCM_PATH=$(Agent.BuildDirectory)/rocm
+            -DCMAKE_BUILD_TYPE=Release
+            -DENABLE_TESTS=ON
+            -DINSTALL_TESTS=ON
+            -GNinja
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
+          testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocprofiler-sdk
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -73,6 +92,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_sdk_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -89,6 +112,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -96,6 +120,8 @@ jobs:
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # key is only passed when downstream jobs are triggered
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Add Python site-packages binaries to path
      inputs:
@@ -105,6 +131,7 @@ jobs:
          echo "##vso[task.prependpath]$USER_BASE/bin"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCPROFILER_BUILD_TESTS=ON
@@ -114,9 +141,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -126,62 +156,67 @@ jobs:
    #     gpuTarget: ${{ job.target }}
    #     registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_sdk_test_${{ job.target }}
-    dependsOn: rocprofiler_sdk_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Add Python and ROCm binaries to path
-      inputs:
-        targetType: inline
-        script: |
-          USER_BASE=$(python3 -m site --user-base)
-          echo "##vso[task.prependpath]$USER_BASE/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCPROFILER_BUILD_TESTS=ON
-          -DROCPROFILER_BUILD_SAMPLES=ON
-          -DROCPROFILER_BUILD_RELEASE=ON
-          -DGPU_TARGETS=${{ job.target }}
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofiler-sdk
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        registerROCmPackages: true
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocprofiler_sdk_test_${{ job.target }}
+      dependsOn: rocprofiler_sdk_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}  # forwarded only when triggerDownstreamJobs is set
+      - task: Bash@3
+        displayName: Add Python and ROCm binaries to path
+        inputs:
+          targetType: inline
+          script: |
+            USER_BASE=$(python3 -m site --user-base)
+            echo "##vso[task.prependpath]$USER_BASE/bin"
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          extraBuildFlags: >-
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+            -DROCPROFILER_BUILD_TESTS=ON
+            -DROCPROFILER_BUILD_SAMPLES=ON
+            -DROCPROFILER_BUILD_RELEASE=ON
+            -DGPU_TARGETS=${{ job.target }}
+            -GNinja
+      - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          registerROCmPackages: true
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -6,6 +6,25 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: componentName
+  type: string
+  default: rocprofiler-systems
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -87,6 +106,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_systems_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:  # one entry per upstream build, suffixed like this job's own name (job.target)
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -105,6 +128,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -136,12 +160,16 @@ jobs:
          -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
          -DGPU_TARGETS=${{ job.target }}
          -GNinja
+        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        gpuTarget: ${{ job.target }}
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        gpuTarget: ${{ job.target }}
+        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
@@ -151,85 +179,95 @@ jobs:
        registerROCmPackages: true
        extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_systems_test_${{ job.target }}
-    dependsOn: rocprofiler_systems_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    timeoutInMinutes: 180
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: ROCM_PATH
-      value: $(Agent.BuildDirectory)/rocm
-    pool:
-      name: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Add ROCm binaries to PATH
-      inputs:
-        targetType: inline
-        script: |
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-  # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
-        extraBuildFlags: >-
-          -DROCPROFSYS_BUILD_TESTING=ON
-          -DROCPROFSYS_BUILD_DYNINST=ON
-          -DROCPROFSYS_BUILD_LIBUNWIND=ON
-          -DROCPROFSYS_DISABLE_EXAMPLES="openmp-target"
-          -DDYNINST_BUILD_TBB=ON
-          -DDYNINST_BUILD_ELFUTILS=ON
-          -DDYNINST_BUILD_LIBIBERTY=ON
-          -DDYNINST_BUILD_BOOST=ON
-          -DROCPROFSYS_USE_PAPI=ON
-          -DROCPROFSYS_USE_MPI=ON
-          -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
-          -DGPU_TARGETS=${{ job.target }}
-          -GNinja
-    - task: Bash@3
-      displayName: Set up rocprofiler-systems env
-      inputs:
-        targetType: inline
-        script: source share/rocprofiler-systems/setup-env.sh
-        workingDirectory: build
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofiler-systems
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        registerROCmPackages: true
-        gpuTarget: ${{ job.target }}
-        extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin
+# Test jobs are generated only when unifiedBuild is false.
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocprofiler_systems_test_${{ job.target }}
+      dependsOn: rocprofiler_systems_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      timeoutInMinutes: 180
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      - name: ROCM_PATH
+        value: $(Agent.BuildDirectory)/rocm
+      pool:
+        name: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      # Pass the same checkout arguments as the build job (sparseCheckoutDir defaults to '').
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Add ROCm binaries to PATH
+        inputs:
+          targetType: inline
+          script: |
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        parameters:
+          # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
+          extraBuildFlags: >-
+            -DROCPROFSYS_BUILD_TESTING=ON
+            -DROCPROFSYS_BUILD_DYNINST=ON
+            -DROCPROFSYS_BUILD_LIBUNWIND=ON
+            -DROCPROFSYS_DISABLE_EXAMPLES="openmp-target"
+            -DDYNINST_BUILD_TBB=ON
+            -DDYNINST_BUILD_ELFUTILS=ON
+            -DDYNINST_BUILD_LIBIBERTY=ON
+            -DDYNINST_BUILD_BOOST=ON
+            -DROCPROFSYS_USE_PAPI=ON
+            -DROCPROFSYS_USE_MPI=ON
+            -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
+            -DGPU_TARGETS=${{ job.target }}
+            -GNinja
+          componentName: ${{ parameters.componentName }}
+      - task: Bash@3
+        displayName: Set up rocprofiler-systems env
+        inputs:
+          targetType: inline
+          script: source share/rocprofiler-systems/setup-env.sh
+          workingDirectory: build
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        parameters:
+          gpuTarget: ${{ job.target }}
+          componentName: ${{ parameters.componentName }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        parameters:
+          gpuTarget: ${{ job.target }}
+          componentName: ${{ parameters.componentName }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          registerROCmPackages: true
+          gpuTarget: ${{ job.target }}
+          extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin
--- a/.azuredevops/components/roctracer.yml
+++ b/.azuredevops/components/roctracer.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -65,6 +81,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -87,6 +107,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
@@ -94,6 +115,8 @@ jobs:
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    # the linker flags will not affect ubuntu2204 builds as the paths do not exist
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
@@ -109,10 +132,13 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -123,53 +149,57 @@ jobs:
    #     gpuTarget: ${{ job.target }}
    #     registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: roctracer
-        testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
-        testParameters: ''
-        testDir: $(Agent.BuildDirectory)
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        registerROCmPackages: true
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
+          testParameters: ''
+          testDir: $(Agent.BuildDirectory)
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          registerROCmPackages: true
--- a/.azuredevops/dependencies/catch2.yml
+++ b/.azuredevops/dependencies/catch2.yml
@@ -0,0 +1,63 @@
+parameters:
+- name: checkoutRepo  # not referenced by any step in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # not referenced by any step in this template
+  type: string
+  default: ''
+- name: catch2Version  # git ref handed to `git clone -b` in the clone step
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix  # one job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: catch2_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:  # almalinux8 entries run inside this container image
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # sources come from the explicit git clone step below
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone catch2 ${{ parameters.catch2Version }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/catchorg/Catch2.git -b ${{ parameters.catch2Version }}
+        workingDirectory: $(Agent.BuildDirectory)  # clone lands in $(Agent.BuildDirectory)/Catch2
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/Catch2/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/Catch2
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/fmtlib.yml
+++ b/.azuredevops/dependencies/fmtlib.yml
@@ -0,0 +1,67 @@
+parameters:
+- name: checkoutRepo  # not referenced by any step in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # not referenced by any step in this template
+  type: string
+  default: ''
+- name: fmtlibVersion  # git ref handed to `git clone -b` in the clone step
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - libfmt-dev  # NOTE(review): distro fmt package installed alongside the source build — confirm it is needed
+
+- name: jobMatrix  # one job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: fmtlib_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:  # almalinux8 entries run inside this container image
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # sources come from the explicit git clone step below
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone fmtlib ${{ parameters.fmtlibVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/fmtlib/fmt.git -b ${{ parameters.fmtlibVersion }}
+        workingDirectory: $(Agent.BuildDirectory)  # clone lands in $(Agent.BuildDirectory)/fmt
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/fmt/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/fmt
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DFMT_SYSTEM_HEADERS=ON
+          -DFMT_INSTALL=ON
+          -DFMT_TEST=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/libdivide.yml
+++ b/.azuredevops/dependencies/libdivide.yml
@@ -0,0 +1,64 @@
+parameters:
+- name: checkoutRepo  # not referenced by any step in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # not referenced by any step in this template
+  type: string
+  default: ''
+- name: libdivideVersion  # git ref handed to `git clone -b` in the clone step
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix  # one job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: libdivide_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:  # almalinux8 entries run inside this container image
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # sources come from the explicit git clone step below
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone libdivide ${{ parameters.libdivideVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/ridiculousfish/libdivide.git -b ${{ parameters.libdivideVersion }}
+        workingDirectory: $(Agent.BuildDirectory)  # clone lands in $(Agent.BuildDirectory)/libdivide
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/libdivide/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/libdivide
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DLIBDIVIDE_BUILD_TESTS=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/spdlog.yml
+++ b/.azuredevops/dependencies/spdlog.yml
@@ -5,20 +5,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+- name: spdlogVersion
+  type: string
+  default: ''
 - name: aptPackages
  type: object
  default:
    - cmake
    - git
    - ninja-build
-    - libfmt-dev

 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, spdlogVersion: "v1.9.2"}
-      - { os: almalinux8, packageManager: dnf, spdlogVersion: "v1.5.0"}
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -41,11 +43,15 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - fmtlib
    - task: Bash@3
-      displayName: Clone spdlog ${{ job.spdlogVersion }}
+      displayName: Clone spdlog ${{ parameters.spdlogVersion }}
      inputs:
        targetType: inline
-        script: git clone https://github.com/gabime/spdlog.git -b ${{ job.spdlogVersion }}
+        script: git clone https://github.com/gabime/spdlog.git -b ${{ parameters.spdlogVersion }}
        workingDirectory: $(Agent.BuildDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
@@ -54,6 +60,7 @@ jobs:
        cmakeSourceDir: $(Agent.BuildDirectory)/spdlog
        useAmdclang: false
        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/vendor
          -DCMAKE_BUILD_TYPE=Release
          -DSPDLOG_USE_STD_FORMAT=OFF
          -DSPDLOG_FMT_EXTERNAL_HO=ON
--- a/.azuredevops/tag-builds/catch2.yml
+++ b/.azuredevops/tag-builds/catch2.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: catch2Version  # forwarded unchanged to the catch2 dependency template below
+  type: string
+  default: "v3.7.0"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none  # CI (push) trigger disabled
+pr: none  # pull-request trigger disabled
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
+    parameters:
+      catch2Version: ${{ parameters.catch2Version }}
--- a/.azuredevops/tag-builds/fmtlib.yml
+++ b/.azuredevops/tag-builds/fmtlib.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: fmtlibVersion
+  type: string
+  default: "11.1.3"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
+    parameters:
+      fmtlibVersion: ${{ parameters.fmtlibVersion }}
--- a/.azuredevops/tag-builds/libdivide.yml
+++ b/.azuredevops/tag-builds/libdivide.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: libdivideVersion
+  type: string
+  default: master
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/libdivide.yml
+    parameters:
+      libdivideVersion: ${{ parameters.libdivideVersion }}
--- a/.azuredevops/tag-builds/spdlog.yml
+++ b/.azuredevops/tag-builds/spdlog.yml
@@ -2,6 +2,11 @@ variables:
 - group: common
 - template: /.azuredevops/variables-global.yml

+parameters:
+- name: spdlogVersion
+  type: string
+  default: "v1.15.1"
+
 resources:
  repositories:
  - repository: pipelines_repo
@@ -14,3 +19,5 @@ pr: none

 jobs:
  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml
+    parameters:
+      spdlogVersion: ${{ parameters.spdlogVersion }}
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -20,7 +20,7 @@ steps:
    retryCountOnTaskFailure: 3
    fetchFilter: blob:none
    ${{ if ne(parameters.sparseCheckoutDir, '') }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
+      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }} shared
      path: sparse
  - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
    - task: Bash@3
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -63,8 +63,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    hip-tests:
-      pipelineId: 233
-      developBranch: amd-staging
+      pipelineId: 362
+      developBranch: develop
      hasGpuTarget: false
    hipBLAS:
      pipelineId: 317
@@ -171,16 +171,16 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocm-core:
-      pipelineId: 103
-      developBranch: master
+      pipelineId: 349
+      developBranch: develop
      hasGpuTarget: false
    rocm-examples:
      pipelineId: 216
      developBranch: amd-staging
      hasGpuTarget: true
    rocminfo:
-      pipelineId: 91
-      developBranch: amd-staging
+      pipelineId: 356
+      developBranch: develop
      hasGpuTarget: false
    rocMLIR:
      pipelineId: 229
@@ -195,8 +195,8 @@ parameters:
      developBranch: master
      hasGpuTarget: false
    rocm_smi_lib:
-      pipelineId: 96
-      developBranch: amd-staging
+      pipelineId: 358
+      developBranch: develop
      hasGpuTarget: false
    rocPRIM:
      pipelineId: 273
@@ -207,7 +207,7 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-compute:
-      pipelineId: 257
+      pipelineId: 344
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-register:
@@ -215,8 +215,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocprofiler-sdk:
-      pipelineId: 246
-      developBranch: amd-staging
+      pipelineId: 347
+      developBranch: develop
      hasGpuTarget: true
    rocprofiler-systems:
      pipelineId: 255
@@ -227,8 +227,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    ROCR-Runtime:
-      pipelineId: 10
-      developBranch: amd-staging
+      pipelineId: 354
+      developBranch: develop
      hasGpuTarget: false
    rocRAND:
      pipelineId: 274
@@ -251,8 +251,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    roctracer:
-      pipelineId: 141
-      developBranch: amd-staging
+      pipelineId: 331
+      developBranch: develop
      hasGpuTarget: true
    rocWMMA:
      pipelineId: 109
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -8,10 +8,14 @@ parameters:
  type: object
  default:
    boost: 250
+    catch2: 343
+    fmtlib: 341
    grpc: 72
    gtest: 73
    half560: 68
    lapack: 69
+    libdivide: 342
+    spdlog: 340

 steps:
 - ${{ each dependency in parameters.dependencyList }}:
@@ -29,7 +33,7 @@ steps:
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: $(Agent.BuildDirectory)/vendor
-      cleanDestinationFolder: true
+      cleanDestinationFolder: false
      overwriteExistingFiles: true
  - task: DeleteFiles@1
    displayName: Clean up ${{ dependency }}
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -62,6 +62,7 @@ CPU
 CPUs
 Cron
 CSC
+CSDATA
 CSE
 CSV
 CSn
@@ -81,6 +82,7 @@ CommonMark
 Concretized
 Conda
 ConnectX
+CountOnes
 CuPy
 da
 Dashboarding
@@ -97,6 +99,7 @@ DIMM
 DKMS
 DL
 DMA
+DOMContentLoaded
 DNN
 DNNL
 DPM
@@ -126,6 +129,7 @@ ESXi
 EoS
 fas
 FBGEMM
+FIFOs
 FFT
 FFTs
 FFmpeg
@@ -156,6 +160,7 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
+GGUF
 Gemma
 GiB
 GIM
@@ -200,6 +205,7 @@ Higgs
 href
 Hyperparameters
 Huggingface
+IB
 ICD
 ICT
 ICV
@@ -208,8 +214,11 @@ IDEs
 IFWI
 IMDb
 IncDec
+instrSize
+interpolators
 IOMMU
 IOP
+IOPS
 IOPM
 IOV
 IRQ
@@ -246,12 +255,15 @@ LLM
 LLMs
 LLVM
 LM
+LRU
 LSAN
 LSan
 LTS
 LSTMs
+LteAll
 LanguageCrossEntropy
 LoRA
+MECO
 MEM
 MERCHANTABILITY
 MFMA
@@ -270,6 +282,7 @@ MNIST
 MPI
 MPT
 MSVC
+mul
 MVAPICH
 MVFFR
 Makefile
@@ -293,6 +306,7 @@ Multicore
 Multithreaded
 MyEnvironment
 MyST
+NANOO
 NBIO
 NBIOs
 NCCL
@@ -347,6 +361,7 @@ PCC
 PCI
 PCIe
 PEFT
+perf
 PEQT
 PIL
 PILImage
@@ -431,6 +446,7 @@ SKUs
 SLES
 SLURM
 SMEM
+SMFMA
 SMI
 SMT
 SPI
@@ -442,18 +458,23 @@ SWE
 SerDes
 ShareGPT
 Shlens
+simd
 Skylake
 Softmax
 Spack
 SplitK
 Supermicro
 Szegedy
+TagRAM
 TCA
 TCC
+TCCs
 TCI
 TCIU
 TCP
 TCR
+THREADGROUPS
+threadgroups
 TensorRT
 TensorFloat
 TF
@@ -497,9 +518,11 @@ UltraChat
 Uncached
 Unittests
 Unhandled
+unwindowed
 VALU
 VBIOS
 VCN
+verl's
 VGPR
 VGPRs
 VM
@@ -512,11 +535,13 @@ Vanhoucke
 Vulkan
 WGP
 WGPs
+WR
 WX
 WikiText
 Wojna
 Workgroups
 Writebacks
+xcc
 XCD
 XCDs
 XGBoost
@@ -537,6 +562,7 @@ ZenDNN
 accuracies
 activations
 addr
+addEventListener
 ade
 ai
 alloc
@@ -552,6 +578,7 @@ autogenerated
 autotune
 avx
 awk
+az
 backend
 backends
 bb
@@ -569,6 +596,7 @@ boson
 bosons
 br
 BrainFloat
+btn
 buildable
 bursty
 bzip
@@ -580,6 +608,7 @@ centric
 changelog
 checkpointing
 chiplet
+classList
 cmake
 cmd
 coalescable
@@ -592,6 +621,7 @@ concretization
 config
 configs
 conformant
+const
 constructible
 convolutional
 convolves
@@ -655,6 +685,7 @@ exascale
 executables
 ffmpeg
 filesystem
+forEach
 fortran
 fp
 framebuffer
@@ -663,6 +694,7 @@ galb
 gcc
 gdb
 gemm
+getAttribute
 gfortran
 gfx
 githooks
@@ -670,6 +702,7 @@ github
 globals
 gnupg
 grayscale
+gx
 gzip
 heterogenous
 hipBLAS
@@ -742,6 +775,7 @@ logits
 lossy
 macOS
 matchers
+maxtext
 megatron
 microarchitecture
 migraphx
@@ -770,6 +804,7 @@ opencv
 openmp
 openssl
 optimizers
+ol
 os
 oversubscription
 pageable
@@ -779,6 +814,7 @@ parallelizing
 param
 parameterization
 passthrough
+pe
 perfcounter
 performant
 perl
@@ -808,6 +844,7 @@ profiler
 profilers
 protobuf
 pseudorandom
+px
 py
 pytorch
 recommender
@@ -815,6 +852,8 @@ recommenders
 quantile
 quantizer
 quasirandom
+querySelector
+querySelectorAll
 queueing
 qwen
 radeon
@@ -833,6 +872,8 @@ req
 resampling
 rescaling
 reusability
+rhel
+rl
 RLHF
 roadmap
 roc
@@ -877,13 +918,16 @@ scalability
 scalable
 scipy
 seealso
+selectedTag
 sendmsg
 seqs
 serializers
+setAttribute
 sglang
 shader
 sharding
 sigmoid
+sles
 sm
 smi
 softmax
@@ -906,6 +950,7 @@ symlink
 symlinks
 sys
 tabindex
+targetContainer
 td
 tensorfloat
 th
@@ -918,6 +963,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/RELEASE.md
+++ b/RELEASE.md
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.3"
+    <default revision="refs/tags/rocm-7.0.0"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
@@ -9,6 +9,7 @@
    <project name="ROCK-Kernel-Driver" />
    <project name="ROCR-Runtime" />
    <project name="amdsmi" />
+    <project name="aqlprofile" />
    <project name="rdc" />
    <project name="rocm_bandwidth_test" />
    <project name="rocm_smi_lib" />
@@ -22,7 +23,7 @@
    <project name="rocprofiler-systems" />
    <project name="roctracer" />
 <!--HIP Projects-->
-    <project name="HIP" />
+    <project name="hip" />
    <project name="hip-tests" />
    <project name="HIPIFY" />
    <project name="clr" />
@@ -37,36 +38,26 @@
    <project name="rocr_debug_agent" />
 <!-- ROCm Libraries -->
    <project groups="mathlibs" name="AMDMIGraphX" />
-    <project groups="mathlibs" name="MIOpen" />
    <project groups="mathlibs" name="MIVisionX" />
    <project groups="mathlibs" name="ROCmValidationSuite" />
-    <project groups="mathlibs" name="Tensile" />
    <project groups="mathlibs" name="composable_kernel" />
-    <project groups="mathlibs" name="hipBLAS-common" />
-    <project groups="mathlibs" name="hipBLAS" />
-    <project groups="mathlibs" name="hipBLASLt" />
-    <project groups="mathlibs" name="hipCUB" />
-    <project groups="mathlibs" name="hipFFT" />
-    <project groups="mathlibs" name="hipRAND" />
    <project groups="mathlibs" name="hipSOLVER" />
-    <project groups="mathlibs" name="hipSPARSE" />
-    <project groups="mathlibs" name="hipSPARSELt" />
    <project groups="mathlibs" name="hipTensor" />
    <project groups="mathlibs" name="hipfort" />
    <project groups="mathlibs" name="rccl" />
    <project groups="mathlibs" name="rocAL" />
    <project groups="mathlibs" name="rocALUTION" />
-    <project groups="mathlibs" name="rocBLAS" />
    <project groups="mathlibs" name="rocDecode" />
    <project groups="mathlibs" name="rocJPEG" />
+    <!-- The following components have been migrated to rocm-libraries:
+        hipBLAS-common hipBLAS hipBLASLt hipCUB
+        hipFFT hipRAND hipSPARSE hipSPARSELt
+        MIOpen rocBLAS rocFFT rocPRIM rocRAND
+        rocSPARSE rocThrust Tensile -->
+    <project groups="mathlibs" name="rocm-libraries" />
    <project groups="mathlibs" name="rocPyDecode" />
-    <project groups="mathlibs" name="rocFFT" />
-    <project groups="mathlibs" name="rocPRIM" />
-    <project groups="mathlibs" name="rocRAND" />
    <project groups="mathlibs" name="rocSHMEM" />
    <project groups="mathlibs" name="rocSOLVER" />
-    <project groups="mathlibs" name="rocSPARSE" />
-    <project groups="mathlibs" name="rocThrust" />
    <project groups="mathlibs" name="rocWMMA" />
    <project groups="mathlibs" name="rocm-cmake" />
    <project groups="mathlibs" name="rpp" />
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -29,7 +29,7 @@ additional licenses. Please review individual repositories for more information.
 | [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
-| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE) |
+| [AQLprofile](https://github.com/ROCm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
 | [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
 | [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
 | [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
@@ -50,7 +50,7 @@ additional licenses. Please review individual repositories for more information.
 | [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
 | [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
-| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
+| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
 | [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
 | [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
 | [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
@@ -67,15 +67,15 @@ additional licenses. Please review individual repositories for more information.
 | [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
 | [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
 | [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
-| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE) |
+| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
 | [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
 | [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
 | [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
-| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/License.txt) |
-| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE) |
+| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
+| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
 | [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
 | [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
-| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE) |
+| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
 | [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
 | [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
 | [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -1,131 +1,134 @@
-ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
-      ,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
-      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
-      ,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
-      ,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
-      ,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
-      ,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
-      ,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
-      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
-      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
-      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
-      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
-      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
-      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
-      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
-      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
-      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
-      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
-,,,,,,,,,,,,,,,,,,
-      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
-      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
-      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
-      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
-,,,,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,,,,
-      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
-      ,,,,,,,,,,,,,,,,,,
-      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      Thrust,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      CUB,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-,,,,,,,,,,,,,,,,,,
-      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
-      ,,,,,,,,,,,,,,,,,,
-      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
-      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
-      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
-      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
-      ,,,,,,,,,,,,,,,,,,
-      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-      :doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      ,,,,,,,,,,,,,,,,,,
-      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
-      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
-      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
-      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
-      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
-      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
-      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
-      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
-      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
-      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
-      :doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
-      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
-      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
-      ,,,,,,,,,,,,,,,,,,
-      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
-      :doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      ,,,,,,,,,,,,,,,,,,
-      SUPPORT LIBS,,,,,,,,,,,,,,,,,,
-      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
-      ,,,,,,,,,,,,,,,,,,
-      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
-      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
-      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
-      ,,,,,,,,,,,,,,,,,,
-      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCTracer <roctracer:index>`,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
-      ,,,,,,,,,,,,,,,,,,
-      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,
-      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
-      ,,,,,,,,,,,,,,,,,,
-      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
-      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-,,,,,,,,,,,,,,,,,,
-      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      :doc:`HIP <hip:index>`,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
+ROCm Version,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
+      ,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
+      ,"Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
+      ,Debian 12,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+      ,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
+      ,Rocky Linux 9,,,,,,,,,,,,,,,,,,
+      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,,,,,,,,,,,,,,,,,,
+      ,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
+      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
+      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
+      ,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
+      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
+      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
+      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950,,,,,,,,,,,,,,,,,,
+      ,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
+      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
+      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
+      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
+,,,,,,,,,,,,,,,,,,,
+      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.7, 2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
+,,,,,,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,,,,
+      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
+      ,,,,,,,,,,,,,,,,,,,
+      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      Thrust,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      CUB,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+,,,,,,,,,,,,,,,,,,,
+      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      ,,,,,,,,,,,,,,,,,,,
+      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
+      :doc:`MIOpen <miopen:index>`,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
+      :doc:`rocAL <rocal:index>`,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`rocDecode <rocdecode:index>`,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
+      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`RPP <rpp:index>`,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
+      ,,,,,,,,,,,,,,,,,,,
+      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`RCCL <rccl:index>`,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
+      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      ,,,,,,,,,,,,,,,,,,,
+      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
+      :doc:`hipBLAS <hipblas:index>`,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
+      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
+      :doc:`hipfort <hipfort:index>`,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
+      :doc:`hipRAND <hiprand:index>`,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
+      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
+      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
+      :doc:`rocALUTION <rocalution:index>`,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
+      :doc:`rocBLAS <rocblas:index>`,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
+      :doc:`rocRAND <rocrand:index>`,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
+      :doc:`rocSOLVER <rocsolver:index>`,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
+      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
+      :doc:`rocWMMA <rocwmma:index>`,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`Tensile <tensile:src/index>`,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
+      ,,,,,,,,,,,,,,,,,,,
+      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`hipCUB <hipcub:index>`,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`hipTensor <hiptensor:index>`,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
+      :doc:`rocPRIM <rocprim:index>`,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`rocThrust <rocthrust:index>`,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      ,,,,,,,,,,,,,,,,,,,
+      SUPPORT LIBS,,,,,,,,,,,,,,,,,,,
+      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
+      ,,,,,,,,,,,,,,,,,,,
+      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`AMD SMI <amdsmi:index>`,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      ,,,,,,,,,,,,,,,,,,,
+      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCTracer <roctracer:index>`,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
+      ,,,,,,,,,,,,,,,,,,,
+      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,
+      :doc:`HIPIFY <hipify:index>`,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
+      ,,,,,,,,,,,,,,,,,,,
+      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
+      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`llvm-project <llvm-project:index>`,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+,,,,,,,,,,,,,,,,,,,
+      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`HIP <hip:index>`,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -23,26 +23,29 @@ compatibility and system requirements.
 .. container:: format-big-table

  .. csv-table::
-      :header: "ROCm Version", "6.4.3", "6.4.2", "6.3.0"
+      :header: "ROCm Version", "7.0.0", "6.4.3", "6.3.0"
      :stub-columns: 1

-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2
      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.5, 9.4"
-      ,RHEL 8.10,RHEL 8.10,RHEL 8.10
-      ,"SLES 15 SP7, SP6","SLES 15 SP7, SP6","SLES 15 SP6, SP5"
-      ,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
-      ,Debian 12 [#single-node]_,Debian 12 [#single-node]_,
-      ,Azure Linux 3.0 [#mi300x]_,Azure Linux 3.0 [#mi300x]_,
+      ,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_
+      ,SLES 15 SP7 [#sles-db-700]_,"SLES 15 SP7, SP6","SLES 15 SP6, SP5"
+      ,"Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_",Oracle Linux 8.10 [#ol-mi300x]_
+      ,Debian 12 [#sles-db-700]_,Debian 12 [#single-node]_,
+      ,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,
+      ,Rocky Linux 9 [#rl-700]_,,
      ,.. _architecture-support-compatibility-matrix:,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,,
+      ,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
      ,RDNA4,RDNA4,
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,gfx1201 [#RDNA-OS]_,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950,,
+      ,gfx1201 [#RDNA-OS]_,gfx1201 [#RDNA-OS]_,
      ,gfx1200 [#RDNA-OS]_,gfx1200 [#RDNA-OS]_,
      ,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,
      ,gfx1100,gfx1100,gfx1100
@@ -52,113 +55,120 @@ compatibility and system requirements.
      ,gfx908,gfx908,gfx908
      ,,,
      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.7, 2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.31
+      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A
      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A
      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,0.7.0
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3
+      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.20.0,1.17.3
      ,,,
      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.3.0,>=1.3.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.15.0,>=1.15.0
      ,,,
      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
-      Thrust,2.5.0,2.5.0,2.3.2
-      CUB,2.5.0,2.5.0,2.3.2
+      Thrust,2.6.0,2.5.0,2.3.2
+      CUB,2.6.0,2.5.0,2.3.2
      ,,,
      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0
-      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0
-      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0
-      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0
-      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0
-      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0
-      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1
+      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.12.0,2.11.0
+      :doc:`MIOpen <miopen:index>`,3.5.0,3.4.0,3.3.0
+      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.2.0,3.1.0
+      :doc:`rocAL <rocal:index>`,2.3.0,2.2.0,2.1.0
+      :doc:`rocDecode <rocdecode:index>`,1.0.0,0.10.0,0.8.0
+      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,0.8.0,0.6.0
+      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.3.1,0.2.0
+      :doc:`RPP <rpp:index>`,2.0.0,1.9.10,1.9.1
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
-      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
-      :doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.1,N/A
+      :doc:`RCCL <rccl:index>`,2.26.6,2.22.3,2.21.5
+      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,2.0.1,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0
-      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.10.0
-      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17
-      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.0
-      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.0
-      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0
-      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2
-      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.1
-      :doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.1,4.3.0
-      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31
-      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.2,3.27.0
-      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0
-      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0
+      :doc:`hipBLAS <hipblas:index>`,3.0.0,2.4.0,2.3.0
+      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,0.12.1,0.10.0
+      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.18,1.0.17
+      :doc:`hipfort <hipfort:index>`,0.7.0,0.6.0,0.5.0
+      :doc:`hipRAND <hiprand:index>`,3.0.0,2.12.0,2.11.0
+      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,2.4.0,2.3.0
+      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,3.2.0,3.1.2
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.3,0.2.2
+      :doc:`rocALUTION <rocalution:index>`,4.0.0,3.2.3,3.2.1
+      :doc:`rocBLAS <rocblas:index>`,5.0.0,4.4.1,4.3.0
+      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.32,1.0.31
+      :doc:`rocRAND <rocrand:index>`,4.0.0,3.3.0,3.2.0
+      :doc:`rocSOLVER <rocsolver:index>`,3.30.0,3.28.2,3.27.0
+      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,3.4.0,3.3.0
+      :doc:`rocWMMA <rocwmma:index>`,2.0.0,1.7.0,1.6.0
+      :doc:`Tensile <tensile:src/index>`,4.44.0,4.43.0,4.42.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
-      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0
-      :doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.1,3.3.0
-      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0
+      :doc:`hipCUB <hipcub:index>`,4.0.0,3.4.0,3.3.0
+      :doc:`hipTensor <hiptensor:index>`,2.0.0,1.5.0,1.4.0
+      :doc:`rocPRIM <rocprim:index>`,4.0.0,3.4.1,3.3.0
+      :doc:`rocThrust <rocthrust:index>`,4.0.0,3.3.0,3.3.0
      ,,,
      SUPPORT LIBS,,,
-      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.3.42131
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.3,6.4.2,6.3.0
+      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.3.42131
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.3.0
      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,25.5.1,25.5.1,24.7.1
-      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
+      :doc:`AMD SMI <amdsmi:index>`,26.0.0,25.5.1,24.7.1
+      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.7.0,7.5.0,7.4.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.7.0,7.4.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.1.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.1,3.0.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.2,0.1.0
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60403,2.0.60402,2.0.60300
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0
-      :doc:`ROCTracer <roctracer:index>`,4.1.60403,4.1.60402,4.1.60300
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.1.1,3.0.0
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.0,1.0.2,0.1.0
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70000,2.0.60403,2.0.60300
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,0.6.0,0.5.0
+      :doc:`ROCTracer <roctracer:index>`,4.1.70000,4.1.60403,4.1.60300
      ,,,
      DEVELOPMENT TOOLS,,,
-      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.24455
+      :doc:`HIPIFY <hipify:index>`,20.0.0,19.0.0,18.0.0.24455
      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.3,0.77.2,0.77.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,15.2.0,15.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.4.0,0.4.0
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.0.4,2.0.3
      ,,,
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25224,18.0.0.24455
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25224,18.0.0.24491
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25224,18.0.0.24491
+      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25314,19.0.0.25224,18.0.0.24455
+      :doc:`llvm-project <llvm-project:index>`,20.0.0.25314,19.0.0.25224,18.0.0.24491
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25314,19.0.0.25224,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43484,6.3.42131
-      :doc:`HIP <hip:index>`,6.4.43484,6.4.43484,6.3.42131
+      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51830,6.4.43484,6.3.42131
+      :doc:`HIP <hip:index>`,7.0.51830,6.4.43484,6.3.42131
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0
-
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.15.0,1.14.0

 .. rubric:: Footnotes

-.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
-.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+.. [#rhel-700] RHEL 8.10 is only supported on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
+.. [#ol-700-mi300x] **For ROCm 7.0** - Oracle Linux 9 is supported only on AMD Instinct MI300X, MI350X, and MI355X. Oracle Linux 8 is only supported on AMD Instinct MI300X.
+.. [#ol-mi300x] **Prior to ROCm 7.0** - Oracle Linux is supported only on AMD Instinct MI300X.
+.. [#sles-db-700] SLES 15 SP7 and Debian 12 are only supported on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
+.. [#rl-700] Rocky Linux 9 is only supported on AMD Instinct MI300X and MI300A GPUs.
+.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+.. [#az-mi300x] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710.
 .. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
 .. [#7700XT-OS] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
-.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.


@@ -174,28 +184,30 @@ Use this lookup table to confirm which operating system and kernel versions are
   :widths: 40, 20, 30, 20
   :stub-columns: 1

-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 GA, 6.11 HWE", 2.39
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
+   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
+   ,,
+   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
   ,9.5, 5.14+, 2.34
-   ,9.4, 5.14+, 2.34
-   ,9.3, 5.14+, 2.34
+   ,9.4, 5.14.0-427, 2.34
   ,,
-   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0+, 2.28
-   ,8.9, 4.18.0, 2.28
+   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
   ,,
-   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.11.0+, 2.38
+   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.4.0-150700.51, 2.38
   ,15 SP6, "6.5.0+, 6.4.0", 2.38
   ,15 SP5, 5.14.21, 2.31
   ,,
-   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 5.15.0 (UEK), 2.35
+   `Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
+   ,,
+   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 6.12.0 (UEK), 2.34
   ,8, 5.15.0 (UEK), 2.28
   ,,
-   `Debian <https://www.debian.org/download>`_,12, 6.1, 2.36
+   `Debian <https://www.debian.org/download>`_,12, 6.1.0, 2.36
   ,,
-   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.60, 2.38
+   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
   ,,

 .. note::
@@ -228,8 +240,11 @@ Expand for full historical view of:

   .. rubric:: Footnotes

-   .. [#mi300x-past-60] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
-   .. [#single-node-past-60] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+   .. [#ol-700-mi300x-past-60] **For ROCm 7.0.0** - Oracle Linux 9 is supported only on AMD Instinct MI300X, MI350X, and MI355X. Oracle Linux 8 is only supported on AMD Instinct MI300X.
+   .. [#mi300x-past-60] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X.
+   .. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+   .. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710.
+   .. [#az-mi300x-630-past-60] **Prior to ROCm 6.4.0** - Azure Linux 3.0 is supported only on AMD Instinct MI300X.
   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
   .. [#7700XT-OS-past-60] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
   .. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
@@ -246,6 +261,8 @@ Expand for full historical view of:
   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
   .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
   .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+   .. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+   .. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -27,7 +27,7 @@ with ROCm support:
  - Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
    with ROCm and JAX preinstalled.

-  - ROCm JAX repository: `ROCm/jax <https://github.com/ROCm/jax>`_
+  - ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_

  - See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
    to get started.
@@ -310,5 +310,54 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
  Since version 0.1.56, JAX has full support for ROCm, and the
  :ref:`Known issues and important notes <jax_comp_known_issues>` section
  contains details about limitations specific to the ROCm backend. The list of
-  JAX API modules is maintained by the JAX project and is subject to change. 
+  JAX API modules is maintained by the JAX project and is subject to change.
  Refer to the official Jax documentation for the most up-to-date information.
+
+Key features and enhancements for ROCm 7.0
+===============================================================================
+
+- Upgraded XLA backend: Integrates a newer XLA version, enabling better
+  optimizations, broader operator support, and potential performance gains.
+
+- RNN support: Native RNN support (including LSTMs via ``jax.experimental.rnn``)
+  now available on ROCm, aiding sequence model development.
+
+- Comprehensive linear algebra capabilities: Offers robust ``jax.linalg``
+  operations, essential for scientific and machine learning tasks.
+
+- Expanded AMD GPU architecture support: Provides ongoing support for gfx1101
+  GPUs and introduces support for gfx950 and gfx12xx GPUs.
+
+- Mixed FP8 precision support: Enables ``lax.dot_general`` operations with mixed FP8
+  types, offering pathways for memory and compute efficiency.
+
+- Streamlined PyPI packaging: Provides reliable PyPI wheels for JAX on ROCm,
+  simplifying the installation process.
+
+- Pallas experimental kernel development: Continued Pallas framework
+  enhancements for custom GPU kernels, including new intrinsics (specific
+  kernel behaviors under review).
+
+- Improved build system and CI: Enhanced ROCm build system and CI for greater
+  reliability and maintainability.
+
+- Enhanced distributed computing setup: Improved JAX setup in multi-GPU
+  distributed environments.
+
+.. _jax_comp_known_issues:
+
+Known issues and notes for ROCm 7.0
+===============================================================================
+
+- ``nn.dot_product_attention``: Certain configurations of ``jax.nn.dot_product_attention``
+  may cause segmentation faults, though the majority of use cases work correctly.
+
+- SVD with dynamic shapes: SVD on inputs with dynamic/symbolic shapes might result in an error.
+  SVD with static shapes is unaffected.
+
+- QR decomposition with symbolic shapes: QR decomposition operations may fail when using
+  symbolic/dynamic shapes in shape polymorphic contexts.
+
+- Pallas kernels: Specific advanced Pallas kernels may exhibit variations in
+  numerical output or resource usage. These are actively reviewed as part of
+  Pallas's experimental development.
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -0,0 +1,156 @@
+:orphan:
+
+.. meta::
+    :description: llama.cpp deep learning framework compatibility
+    :keywords: GPU, GGML, llama.cpp compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+llama.cpp compatibility
+********************************************************************************
+
+`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework 
+for Large Language Model (LLM) inference that runs on both central processing units 
+(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing 
+a simple, dependency-free setup. 
+
+The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
+to speed up inference and reduce memory usage. Originally built as a CPU-first library, 
+llama.cpp is easy to integrate with other programming environments and is widely 
+adopted across diverse platforms, including consumer devices. 
+
+ROCm support for llama.cpp is upstreamed, and you can build the official source code
+with ROCm support:
+
+- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
+  <https://github.com/ROCm/llama.cpp>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
+
+- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
+  which includes ROCm, llama.cpp, and all required dependencies.
+
+  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
+    to install and get started.
+
+  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
+    in the upstream llama.cpp documentation.
+
+.. note::
+
+  llama.cpp is supported on ROCm 6.4.0.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
+
+- Plain C/C++ implementation with no external dependencies
+- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
+- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
+- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
+
+llama.cpp is also used in a range of real-world applications, including:
+
+- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
+  A simple maze game where AI-controlled agents attempt to trick the player.
+- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
+  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
+- Various other AI applications use llama.cpp as their inference engine;  
+  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
+
+For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
+
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
+  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
+  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
+  AMD Instinct GPUs within the ROCm ecosystem. 
+
+.. _llama-cpp-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. important::
+
+   Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
+
+   - Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` format and quantize them to 4-bit precision.
+   - Server: This image only includes the server executable file.
+   - Light: This image only includes the main executable file.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Full Docker
+      - Server Docker
+      - Light Docker
+      - llama.cpp
+      - Ubuntu
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
+      - 24.04
+
+Key ROCm libraries for llama.cpp
+================================================================================
+
+llama.cpp functionality on ROCm is determined by its underlying library
+dependencies. These ROCm components affect the capabilities, performance, and
+feature set available to developers.
+
+.. list-table::
+    :header-rows: 1
+
+    * - ROCm library
+      - Version
+      - Purpose
+      - Usage
+    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
+      - :version-ref:`hipBLAS rocm_version`
+      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
+        matrix and vector operations.
+      - Supports operations such as matrix multiplication, matrix-vector
+        products, and tensor contractions. Utilized in both dense and batched
+        linear algebra operations.
+    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+      - :version-ref:`hipBLASLt rocm_version`
+      - hipBLASLt is an extension of the hipBLAS library, providing additional
+        features like epilogues fused into the matrix multiplication kernel or
+        use of integer tensor cores.
+      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
+        kernels where possible.
+    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
+      - :version-ref:`rocWMMA rocm_version`
+      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
+        multiplication (GEMM) and accumulation operations with mixed precision
+        support.
+      - Can be used to enhance the flash attention performance on AMD compute, by enabling
+        the flag during compile time.
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -366,7 +366,8 @@ feature set available to developers.
 Supported modules and data types
 ================================================================================

-The following section outlines the supported data types, modules, and domain libraries available in PyTorch on ROCm.
+The following section outlines the supported data types, modules, and domain
+libraries available in PyTorch on ROCm.

 Supported data types
 --------------------------------------------------------------------------------
@@ -533,3 +534,72 @@ with ROCm.
        dispatching.

        **Note:** Only official release exists.
+
+Key features and enhancements for PyTorch 2.7 with ROCm 7.0
+================================================================================
+
+- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
+  TunableOp operations, improved offline tuning for ScaledGEMM operations,
+  submatrix offline tuning capabilities, and better logging for BLAS operations
+  without bias vectors.
+
+- Expanded GPU architecture support: Provides optimized support for newer GPU
+  architectures, including gfx1200 and gfx1201 with preferred hipBLASLt backend
+  selection, along with improvements for gfx950 and gfx1100 series GPUs.
+
+- Advanced Triton integration: AOTriton 0.10b introduces official support for
+  gfx950 and gfx1201, along with experimental support for gfx1101, gfx1151,
+  gfx1150, and gfx1200.
+
+- Improved element-wise kernel performance: Delivers enhanced vectorized
+  element-wise kernels with better support for heterogeneous tensor types and
+  optimized input vectorization for tensors with mixed data types.
+
+- MIOpen deep learning optimizations: Enables NHWC BatchNorm by default on
+  ROCm 7.0+, provides ``maxpool`` forward and backward performance improvements
+  targeting ResNet scenarios, and includes updated launch configurations for
+  better performance.
+
+- Enhanced memory and tensor operations: Features fixes for in-place ``aten``
+  sum operations with specialized templated kernels, improved 3D tensor
+  performance with NHWC format, and better handling of memory-bound matrix
+  multiplication operations.
+
+- Robust testing and quality improvements: Includes comprehensive test suite
+  updates with improved tolerance handling for Navi3x architectures, generalized
+  ROCm-specific test conditions, and enhanced unit test coverage for Flash
+  Attention and Memory Efficient operations.
+
+- Build system and infrastructure improvements: Provides updated CentOS Stream 9
+  support, improved Docker configuration, migration to public MAGMA repository,
+  and enhanced QA automation scripts for PyTorch unit testing.
+
+- Composable Kernel (CK) updates: Features updated CK submodule integration with
+  the latest optimizations and performance improvements for core mathematical
+  operations.
+
+- Development and debugging enhancements: Includes improved source handling for
+  dynamic compilation, better error handling for atomic operations, and enhanced
+  state checking for trace operations.
+
+- Integrate APEX fused layer normalization, which can have a positive impact on
+  text-to-video models.
+
+- Integrate APEX distributed fused LAMB and distributed fused ADAM, which can
+  have a positive impact on BERT-L and Llama2-SFT.
+
+- FlashAttention v3 has been integrated for AMD GPUs.
+
+- `PyTorch C++ extensions <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_
+  provide a mechanism for compiling custom operations that can be used during
+  network training or inference. For AMD platforms, ``amdclang++`` has been
+  validated as the supported compiler for building these extensions.
+
+Known issues and notes for PyTorch 2.7 with ROCm 7.0
+================================================================================
+
+- The ``matmul.allow_fp16_reduced_precision_reduction`` and
+  ``matmul.allow_bf16_reduced_precision_reduction`` options under 
+  ``torch.backends.cuda`` are not supported. As a result, 
+  reduced-precision reductions using FP16 or BF16 accumulation types are not
+  available.
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -0,0 +1,111 @@
+:orphan:
+
+.. meta::
+    :description: Ray deep learning framework compatibility
+    :keywords: GPU, Ray compatibility
+
+.. version-set:: rocm_version latest
+
+*******************************************************************************
+Ray compatibility
+*******************************************************************************
+
+Ray is a unified framework for scaling AI and Python applications from your laptop 
+to a full cluster, without changing your code. Ray consists of `a core distributed 
+runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
+`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for 
+simplifying machine learning computations.
+
+Ray is a general-purpose framework that runs many types of workloads efficiently. 
+Any Python application can be scaled with Ray, without extra infrastructure.
+
+ROCm support for Ray is upstreamed, and you can build the official source code
+with ROCm support: 
+
+- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
+  <https://github.com/ROCm/ray>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
+
+- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
+  which includes ROCm, Ray, and all required dependencies.
+
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+    for instructions to get started.
+
+  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
+    in the upstream Ray documentation.
+
+  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
+    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
+
+.. note::
+
+  Ray is supported on ROCm 6.4.1.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm 
+  Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__  
+  blog provides an overview of Volcano Engine Reinforcement Learning (verl) 
+  for large language models (LLMs) and discusses its benefits in large-scale 
+  reinforcement learning from human feedback (RLHF). It uses Ray as part of a 
+  hybrid orchestration engine to schedule and coordinate training and inference 
+  tasks in parallel, enabling optimized resource utilization and potential overlap 
+  between these phases. This dynamic resource allocation strategy significantly 
+  improves overall system efficiency. The blog presents verl’s performance results, 
+  focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X 
+  GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and 
+  accelerate your RLHF training with ROCm-optimized performance.
+
+* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows 
+  <https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
+  blog post describes key use cases such as training and inference for large language models (LLMs), 
+  model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale 
+  workloads using Ray in the ROCm environment.
+
+For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support 
+topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__ 
+of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
+
+.. _ray-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest Ray version from the official Docker Hub and are validated for
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - Ray
+      - PyTorch
+      - Ubuntu
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
+      - 2.6.0+git684f6f2
+      - 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
--- a/docs/conceptual/gpu-arch.md
+++ b/docs/conceptual/gpu-arch.md
@@ -21,7 +21,8 @@ architecture.
 * [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
 * [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
 * [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
-* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
+* [MI300 performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
+* [MI350 series performance counters](./gpu-arch/mi350-performance-counters.rst)
 :::

 :::{grid-item-card}
--- a/docs/conceptual/gpu-arch/mi350-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi350-performance-counters.rst
@@ -0,0 +1,530 @@
+.. meta::
+  :description: MI350 series performance counters and metrics
+  :keywords: MI355, MI355X, MI3XX
+
+***********************************
+MI350 series performance counters
+***********************************
+
+This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 accelerators. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.
+
+The following sections list the performance counters based on the IP blocks.
+
+Command processor packet processor counters (CPC)
+==================================================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - CPC_ALWAYS_COUNT
+      - Always count.
+
+    * - CPC_ADC_VALID_CHUNK_NOT_AVAIL
+      - ADC valid chunk is not available when dispatch walking is in progress in the multi-xcc mode.
+
+    * - CPC_ADC_DISPATCH_ALLOC_DONE
+      - ADC dispatch allocation is done.
+
+    * - CPC_ADC_VALID_CHUNK_END
+      - ADC crawler's valid chunk end in the multi-xcc mode.
+
+    * - CPC_SYNC_FIFO_FULL_LEVEL
+      - SYNC FIFO full last cycles.
+
+    * - CPC_SYNC_FIFO_FULL
+      - SYNC FIFO full times.
+
+    * - CPC_GD_BUSY
+      - ADC busy.
+
+    * - CPC_TG_SEND
+      - ADC thread group send.
+
+    * - CPC_WALK_NEXT_CHUNK
+      - ADC walking next valid chunk in the multi-xcc mode.
+
+    * - CPC_STALLED_BY_SE0_SPI
+      - ADC CSDATA stalled by SE0SPI.
+
+    * - CPC_STALLED_BY_SE1_SPI
+      - ADC CSDATA stalled by SE1SPI.
+
+    * - CPC_STALLED_BY_SE2_SPI
+      - ADC CSDATA stalled by SE2SPI.
+
+    * - CPC_STALLED_BY_SE3_SPI
+      - ADC CSDATA stalled by SE3SPI.
+
+    * - CPC_LTE_ALL
+      - CPC sync counter LteAll. Only Master XCD manages LteAll.
+
+    * - CPC_SYNC_WRREQ_FIFO_BUSY
+      - CPC sync counter request FIFO is not empty.
+
+    * - CPC_CANE_BUSY
+      - CPC CANE bus is busy, which indicates the presence of inflight sync counter requests.
+
+    * - CPC_CANE_STALL
+      - CPC sync counter sending is stalled by CANE.
+
+Shader pipe interpolators (SPI) counters
+=========================================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - SPI_CS0_WINDOW_VALID
+      - Clock count enabled by PIPE0 perfcounter_start event.
+
+    * - SPI_CS0_BUSY
+      - Number of clocks with outstanding waves for PIPE0 (SPI or SH).
+
+    * - SPI_CS0_NUM_THREADGROUPS
+      - Number of thread groups launched for PIPE0.
+
+    * - SPI_CS0_CRAWLER_STALL
+      - Number of clocks when PIPE0 event or wave order FIFO is full.
+
+    * - SPI_CS0_EVENT_WAVE
+      - Number of PIPE0 events and waves.
+
+    * - SPI_CS0_WAVE
+      - Number of PIPE0 waves.
+
+    * - SPI_CS1_WINDOW_VALID
+      - Clock count enabled by PIPE1 perfcounter_start event.
+
+    * - SPI_CS1_BUSY
+      - Number of clocks with outstanding waves for PIPE1 (SPI or SH).
+
+    * - SPI_CS1_NUM_THREADGROUPS
+      - Number of thread groups launched for PIPE1.
+
+    * - SPI_CS1_CRAWLER_STALL
+      - Number of clocks when PIPE1 event or wave order FIFO is full.
+
+    * - SPI_CS1_EVENT_WAVE
+      - Number of PIPE1 events and waves.
+
+    * - SPI_CS1_WAVE
+      - Number of PIPE1 waves.
+
+    * - SPI_CS2_WINDOW_VALID
+      - Clock count enabled by PIPE2 perfcounter_start event.
+
+    * - SPI_CS2_BUSY
+      - Number of clocks with outstanding waves for PIPE2 (SPI or SH).
+
+    * - SPI_CS2_NUM_THREADGROUPS
+      - Number of thread groups launched for PIPE2.
+
+    * - SPI_CS2_CRAWLER_STALL
+      - Number of clocks when PIPE2 event or wave order FIFO is full.
+
+    * - SPI_CS2_EVENT_WAVE
+      - Number of PIPE2 events and waves.
+
+    * - SPI_CS2_WAVE
+      - Number of PIPE2 waves.
+
+    * - SPI_CS3_WINDOW_VALID
+      - Clock count enabled by PIPE3 perfcounter_start event.
+
+    * - SPI_CS3_BUSY
+      - Number of clocks with outstanding waves for PIPE3 (SPI or SH).
+
+    * - SPI_CS3_NUM_THREADGROUPS
+      - Number of thread groups launched for PIPE3.
+
+    * - SPI_CS3_CRAWLER_STALL
+      - Number of clocks when PIPE3 event or wave order FIFO is full.
+
+    * - SPI_CS3_EVENT_WAVE
+      - Number of PIPE3 events and waves.
+
+    * - SPI_CS3_WAVE
+      - Number of PIPE3 waves.
+
+    * - SPI_CSQ_P0_Q0_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue0.
+
+    * - SPI_CSQ_P0_Q1_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue1.
+
+    * - SPI_CSQ_P0_Q2_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue2.
+
+    * - SPI_CSQ_P0_Q3_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue3.
+
+    * - SPI_CSQ_P0_Q4_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue4.
+
+    * - SPI_CSQ_P0_Q5_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue5.
+
+    * - SPI_CSQ_P0_Q6_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue6.
+
+    * - SPI_CSQ_P0_Q7_OCCUPANCY
+      - Sum of occupancy info for PIPE0 Queue7.
+
+    * - SPI_CSQ_P1_Q0_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue0.
+
+    * - SPI_CSQ_P1_Q1_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue1.
+
+    * - SPI_CSQ_P1_Q2_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue2.
+
+    * - SPI_CSQ_P1_Q3_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue3.
+
+    * - SPI_CSQ_P1_Q4_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue4.
+
+    * - SPI_CSQ_P1_Q5_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue5.
+
+    * - SPI_CSQ_P1_Q6_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue6.
+
+    * - SPI_CSQ_P1_Q7_OCCUPANCY
+      - Sum of occupancy info for PIPE1 Queue7.
+
+    * - SPI_CSQ_P2_Q0_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue0.
+
+    * - SPI_CSQ_P2_Q1_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue1.
+
+    * - SPI_CSQ_P2_Q2_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue2.
+
+    * - SPI_CSQ_P2_Q3_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue3.
+
+    * - SPI_CSQ_P2_Q4_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue4.
+
+    * - SPI_CSQ_P2_Q5_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue5.
+
+    * - SPI_CSQ_P2_Q6_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue6.
+
+    * - SPI_CSQ_P2_Q7_OCCUPANCY
+      - Sum of occupancy info for PIPE2 Queue7.
+
+    * - SPI_CSQ_P3_Q0_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue0.
+
+    * - SPI_CSQ_P3_Q1_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue1.
+
+    * - SPI_CSQ_P3_Q2_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue2.
+
+    * - SPI_CSQ_P3_Q3_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue3.
+
+    * - SPI_CSQ_P3_Q4_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue4.
+
+    * - SPI_CSQ_P3_Q5_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue5.
+
+    * - SPI_CSQ_P3_Q6_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue6.
+
+    * - SPI_CSQ_P3_Q7_OCCUPANCY
+      - Sum of occupancy info for PIPE3 Queue7.
+
+    * - SPI_CSQ_P0_OCCUPANCY
+      - Sum of occupancy info for all PIPE0 queues.
+
+    * - SPI_CSQ_P1_OCCUPANCY
+      - Sum of occupancy info for all PIPE1 queues.
+
+    * - SPI_CSQ_P2_OCCUPANCY
+      - Sum of occupancy info for all PIPE2 queues.
+
+    * - SPI_CSQ_P3_OCCUPANCY
+      - Sum of occupancy info for all PIPE3 queues.
+
+    * - SPI_VWC0_VDATA_VALID_WR
+      - Number of clocks VGPR bus_0 writes VGPRs.
+
+    * - SPI_VWC1_VDATA_VALID_WR
+      - Number of clocks VGPR bus_1 writes VGPRs.
+
+    * - SPI_CSC_WAVE_CNT_BUSY
+      - Number of cycles when there is any wave in the pipe.
+
+Compute unit (SQ) counters
+===========================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - SQ_INSTS_VALU_MFMA_F6F4
+      - Number of VALU V_MFMA_*_F6F4 instructions.
+
+    * - SQ_INSTS_VALU_MFMA_MOPS_F6F4
+      - Number of VALU matrix math operations (add or mul) performed, divided by 512, assuming a full EXEC mask, for the F6 or F4 data type.
+
+    * - SQ_ACTIVE_INST_VALU2
+      - Number of quad-cycles when two VALU instructions are issued (per-simd, nondeterministic).
+
+    * - SQ_INSTS_LDS_LOAD
+      - Number of LDS load instructions issued (per-simd, emulated).
+
+    * - SQ_INSTS_LDS_STORE
+      - Number of LDS store instructions issued (per-simd, emulated).
+
+    * - SQ_INSTS_LDS_ATOMIC
+      - Number of LDS atomic instructions issued (per-simd, emulated).
+
+    * - SQ_INSTS_LDS_LOAD_BANDWIDTH
+      - Total number of 64-bytes loaded (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
+
+    * - SQ_INSTS_LDS_STORE_BANDWIDTH
+      - Total number of 64-bytes written (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
+
+    * - SQ_INSTS_LDS_ATOMIC_BANDWIDTH
+      - Total number of 64-bytes atomic (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
+
+    * - SQ_INSTS_VALU_FLOPS_FP16
+      - Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
+
+    * - SQ_INSTS_VALU_FLOPS_FP32
+      - Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
+
+    * - SQ_INSTS_VALU_FLOPS_FP64
+      - Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
+
+    * - SQ_INSTS_VALU_FLOPS_FP16_TRANS
+      - Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
+
+    * - SQ_INSTS_VALU_FLOPS_FP32_TRANS
+      - Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
+
+    * - SQ_INSTS_VALU_FLOPS_FP64_TRANS
+      - Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
+
+    * - SQ_INSTS_VALU_IOPS
+      - Counts OPS per instruction on integer or unsigned or bit data (per-simd, emulated).
+
+    * - SQ_LDS_DATA_FIFO_FULL
+      - Number of cycles LDS data FIFO is full (nondeterministic, unwindowed).
+
+    * - SQ_LDS_CMD_FIFO_FULL
+      - Number of cycles LDS command FIFO is full (nondeterministic, unwindowed).
+
+    * - SQ_VMEM_TA_ADDR_FIFO_FULL
+      - Number of cycles texture requests are stalled due to full address FIFO in TA (nondeterministic, unwindowed).
+
+    * - SQ_VMEM_TA_CMD_FIFO_FULL
+      - Number of cycles texture requests are stalled due to full cmd FIFO in TA (nondeterministic, unwindowed).
+
+    * - SQ_VMEM_WR_TA_DATA_FIFO_FULL
+      - Number of cycles texture writes are stalled due to full data FIFO in TA (nondeterministic, unwindowed).
+
+    * - SQC_ICACHE_MISSES_DUPLICATE
+      - Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
+
+    * - SQC_DCACHE_MISSES_DUPLICATE
+      - Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
+
+Texture addressing (TA) unit counters
+======================================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - TA_BUFFER_READ_LDS_WAVEFRONTS
+      - Number of buffer read wavefronts for LDS return processed by the TA.
+
+    * - TA_FLAT_READ_LDS_WAVEFRONTS
+      - Number of flat opcode reads for LDS return processed by the TA.
+
+Texture data (TD) unit counters
+================================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - TD_WRITE_ACKT_WAVEFRONT
+      - Number of write acknowledgments, sent to SQ and not to SP.
+
+    * - TD_TD_SP_TRAFFIC
+      - Number of times this TD sends data to the SP.
+
+Texture cache per pipe (TCP) counters
+======================================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - TCP_TCP_TA_ADDR_STALL_CYCLES
+      - TCP stalls TA addr interface.
+
+    * - TCP_TCP_TA_DATA_STALL_CYCLES
+      - TCP stalls TA data interface. Now windowed.
+
+    * - TCP_LFIFO_STALL_CYCLES
+      - Memory latency FIFOs full stall.
+
+    * - TCP_RFIFO_STALL_CYCLES
+      - Memory request FIFOs full stall.
+
+    * - TCP_TCR_RDRET_STALL
+      - Write into cache stalled by read return from TCR.
+
+    * - TCP_PENDING_STALL_CYCLES
+      - Stall due to data pending from L2.
+
+    * - TCP_UTCL1_SERIALIZATION_STALL
+      - Total number of stalls caused by serializing translation requests through the UTCL1.
+
+    * - TCP_UTCL1_THRASHING_STALL
+      - Stall caused by thrashing feature in any probe. Lacks accuracy when the stall signal overlaps between probe0 and probe1, which is worse with MECO of thrashing deadlock. Some probe0 events might not be counted when MECO is on. This performance counter provides a rough thrashing estimate.
+
+    * - TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS
+      - Translation miss_under_miss.
+
+    * - TCP_UTCL1_STALL_INFLIGHT_MAX
+      - Total UTCL1 stalls due to inflight counter saturation.
+
+    * - TCP_UTCL1_STALL_LRU_INFLIGHT
+      - Total UTCL1 stalls due to LRU cache line with inflight traffic.
+
+    * - TCP_UTCL1_STALL_MULTI_MISS
+      - Total UTCL1 stalls due to arbitrated multiple misses.
+
+    * - TCP_UTCL1_LFIFO_FULL
+      - Total UTCL1 and UTCL2 latency, which hides FIFO full cycles.
+
+    * - TCP_UTCL1_STALL_LFIFO_NOT_RES
+      - Total UTCL1 stalls due to UTCL2 latency, which hides FIFO output (not resident).
+
+    * - TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS
+      - Total UTCL1 stalls due to UTCL2_req being out of credits.
+
+    * - TCP_CLIENT_UTCL1_INFLIGHT
+      - The sum of inflight client to UTCL1 requests per cycle.
+
+    * - TCP_TAGRAM0_REQ
+      - Total L2 requests mapping to TagRAM 0 from this TCP to all TCCs.
+
+    * - TCP_TAGRAM1_REQ
+      - Total L2 requests mapping to TagRAM 1 from this TCP to all TCCs.
+
+    * - TCP_TAGRAM2_REQ
+      - Total L2 requests mapping to TagRAM 2 from this TCP to all TCCs.
+
+    * - TCP_TAGRAM3_REQ
+      - Total L2 requests mapping to TagRAM 3 from this TCP to all TCCs.
+
+    * - TCP_TCP_LATENCY
+      - Total TCP wave latency (from the first clock of wave entering to the first clock of wave leaving). Divide by TA_TCP_STATE_READ to find average wave latency.
+
+    * - TCP_TCC_READ_REQ_LATENCY
+      - Total TCP to TCC request latency for reads and atomics with return. Not windowed.
+
+    * - TCP_TCC_WRITE_REQ_LATENCY
+      - Total TCP to TCC request latency for writes and atomics without return. Not windowed.
+
+    * - TCP_TCC_WRITE_REQ_HOLE_LATENCY
+      - Total TCP req to TCC hole latency for writes and atomics. Not windowed.
+
+Texture cache per channel (TCC) counters
+=========================================
+
+.. list-table::
+    :header-rows: 1
+
+    * - Hardware counter
+      - Definition
+
+    * - TCC_READ_SECTORS
+      - Total number of 32B data sectors in read requests.
+
+    * - TCC_WRITE_SECTORS
+      - Total number of 32B data sectors in write requests.
+
+    * - TCC_ATOMIC_SECTORS
+      - Total number of 32B data sectors in atomic requests.
+
+    * - TCC_BYPASS_REQ
+      - Number of bypass requests. This is measured at the tag block.
+
+    * - TCC_LATENCY_FIFO_FULL
+      - Number of cycles when the latency FIFO is full.
+
+    * - TCC_SRC_FIFO_FULL
+      - Number of cycles when the SRC FIFO is assumed to be full as measured at the IB block.
+
+    * - TCC_EA0_RDREQ_64B
+      - Number of 64-byte TCC/EA read requests.
+
+    * - TCC_EA0_RDREQ_128B
+      - Number of 128-byte TCC/EA read requests.
+
+    * - TCC_IB_REQ
+      - Number of requests through the IB. This measures the number of raw requests from graphics clients to this TCC.
+
+    * - TCC_IB_STALL
+      - Number of cycles when the IB output is stalled.
+
+    * - TCC_EA0_WRREQ_WRITE_DRAM
+      - Number of TCC/EA write requests (32-byte or 64-byte) destined for DRAM (MC).
+
+    * - TCC_EA0_WRREQ_ATOMIC_DRAM
+      - Number of TCC/EA atomic requests (32-byte or 64-byte) destined for DRAM (MC).
+
+    * - TCC_EA0_RDREQ_DRAM_32B
+      - Number of 32-byte TCC/EA read requests due to DRAM traffic. One 64-byte request is counted as two and one 128-byte as four.
+
+    * - TCC_EA0_RDREQ_GMI_32B
+      - Number of 32-byte TCC/EA read requests due to GMI traffic. One 64-byte request is counted as two and one 128-byte as four.
+
+    * - TCC_EA0_RDREQ_IO_32B
+      - Number of 32-byte TCC/EA read requests due to IO traffic. One 64-byte request is counted as two and one 128-byte as four.
+
+    * - TCC_EA0_WRREQ_WRITE_DRAM_32B
+      - Number of 32-byte TCC/EA write requests due to DRAM traffic. One 64-byte request is counted as two.
+
+    * - TCC_EA0_WRREQ_ATOMIC_DRAM_32B
+      - Number of 32-byte TCC/EA atomic requests due to DRAM traffic. One 64-byte request is counted as two.
+
+    * - TCC_EA0_WRREQ_WRITE_GMI_32B
+      - Number of 32-byte TCC/EA write requests due to GMI traffic. One 64-byte request is counted as two.
+
+    * - TCC_EA0_WRREQ_ATOMIC_GMI_32B
+      - Number of 32-byte TCC/EA atomic requests due to GMI traffic. One 64-byte request is counted as two.
+
+    * - TCC_EA0_WRREQ_WRITE_IO_32B
+      - Number of 32-byte TCC/EA write requests due to IO traffic. One 64-byte request is counted as two.
+
+    * - TCC_EA0_WRREQ_ATOMIC_IO_32B
+      - Number of 32-byte TCC/EA atomic requests due to IO traffic. One 64-byte request is counted as two.
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -89,15 +89,15 @@ project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "6.4.3"
-release = "6.4.3"
+version = "7.0.0"
+release = "7.0.0"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""

 # pages with specific settings
 article_pages = [
-    {"file": "about/release-notes", "os": ["linux"], "date": "2025-08-07"},
+    {"file": "about/release-notes", "os": ["linux"], "date": "2025-09-16"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
@@ -108,6 +108,8 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
@@ -124,14 +126,19 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
@@ -156,6 +163,8 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

--- a/docs/data/about/compatibility/floating-point-data-types.png
+++ b/docs/data/about/compatibility/floating-point-data-types.png
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
@@ -0,0 +1,91 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
+      rocm_version: 6.4.1
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
@@ -1,17 +1,16 @@
-sglang_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
-      docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
-      rocm_version: 6.3.0
-      sglang_version: 0.4.5 (0.4.5-rocm)
-      pytorch_version: 2.6.0a0+git8d4926e
-  model_groups:
-    - group: DeepSeek
-      tag: deepseek
-      models:
-      - model: DeepSeek-R1-Distill-Qwen-32B
-        mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
-        model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-        precision: bfloat16
+dockers:
+  - pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
+    components:
+      ROCm: 6.3.0
+      SGLang: 0.4.5 (0.4.5-rocm)
+      PyTorch: 2.6.0a0+git8d4926e
+model_groups:
+  - group: DeepSeek
+    tag: deepseek
+    models:
+    - model: DeepSeek-R1-Distill-Qwen-32B
+      mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
+      model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+      url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+      precision: bfloat16
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,88 +1,188 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
-      rocm_version: 6.4.1
-      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
-      hipblaslt_version: 0.15
-  model_groups:
-    - group: Meta Llama
-      tag: llama
-      models:
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-      - model: Llama 3.1 70B
-        mad_tag: pyt_vllm_llama-3.1-70b
-        model_repo: meta-llama/Llama-3.1-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: float16
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 70B FP8
-        mad_tag: pyt_vllm_llama-3.1-70b_fp8
-        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-    - group: Mistral AI
-      tag: mistral
-      models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-    - group: Qwen
-      tag: qwen
-      models:
-      - model: QwQ-32B
-        mad_tag: pyt_vllm_qwq-32b
-        model_repo: Qwen/QwQ-32B
-        url: https://huggingface.co/Qwen/QwQ-32B
-        precision: float16
-        tunableop: true
-    - group: Microsoft Phi
-      tag: phi
-      models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
+dockers:
+  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
+    components:
+      ROCm: 6.4.1
+      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+      PyTorch: 2.7.0+gitf717b2a
+      hipBLASLt: 0.15
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_vllm_llama-3.1-8b
+      model_repo: meta-llama/Llama-3.1-8B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B
+      mad_tag: pyt_vllm_llama-3.1-70b
+      model_repo: meta-llama/Llama-3.1-70B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B
+      mad_tag: pyt_vllm_llama-3.1-405b
+      model_repo: meta-llama/Llama-3.1-405B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 2 70B
+      mad_tag: pyt_vllm_llama-2-70b
+      model_repo: meta-llama/Llama-2-70b-chat-hf
+      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 4096
+        max_num_batched_tokens: 4096
+        max_model_len: 4096
+    - model: Llama 3.1 8B FP8
+      mad_tag: pyt_vllm_llama-3.1-8b_fp8
+      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B FP8
+      mad_tag: pyt_vllm_llama-3.1-70b_fp8
+      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B FP8
+      mad_tag: pyt_vllm_llama-3.1-405b_fp8
+      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+  - group: Mistral AI
+    tag: mistral
+    models:
+    - model: Mixtral MoE 8x7B
+      mad_tag: pyt_vllm_mixtral-8x7b
+      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B
+      mad_tag: pyt_vllm_mixtral-8x22b
+      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
+    - model: Mixtral MoE 8x7B FP8
+      mad_tag: pyt_vllm_mixtral-8x7b_fp8
+      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B FP8
+      mad_tag: pyt_vllm_mixtral-8x22b_fp8
+      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: QwQ-32B
+      mad_tag: pyt_vllm_qwq-32b
+      model_repo: Qwen/QwQ-32B
+      url: https://huggingface.co/Qwen/QwQ-32B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Qwen3 30B A3B
+      mad_tag: pyt_vllm_qwen3-30b-a3b
+      model_repo: Qwen/Qwen3-30B-A3B
+      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+  - group: Microsoft Phi
+    tag: phi
+    models:
+    - model: Phi-4
+      mad_tag: pyt_vllm_phi-4
+      model_repo: microsoft/phi-4
+      url: https://huggingface.co/microsoft/phi-4
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 16384
+        max_num_batched_tokens: 16384
+        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -0,0 +1,72 @@
+dockers:
+  - pull_tag: rocm/jax-training:maxtext-v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.5.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.x.x
+  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.6.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.1.0-499ece1c21
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3 8B
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 2 7B
+        mad_tag: jax_maxtext_train_llama-2-7b
+        model_repo: Llama-2-7B
+        precision: bf16
+        multinode_training_script: llama2_7b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+      - model: Llama 2 70B
+        mad_tag: jax_maxtext_train_llama-2-70b
+        model_repo: Llama-2-70B
+        precision: bf16
+        multinode_training_script: llama2_70b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V2-Lite (16B)
+        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
+        model_repo: DeepSeek-V2-lite
+        precision: bf16
+        doc_options: ["single-node"]
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: jax_maxtext_train_mixtral-8x7b
+        model_repo: Mixtral-8x7B
+        precision: bf16
+        doc_options: ["single-node"]
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora, HF_finetune_lora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain]
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
+  - group: Meta Llama
+    tag: llama
    models:
    - model: Llama 4 Scout 17B-16E
      mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      training_modes: [pretrain, finetune_fw, finetune_lora]
    - model: Llama 3.1 405B
      mad_tag: pyt_train_llama-3.1-405b
      model_repo: Llama-3.1-405B
      url: https://huggingface.co/meta-llama/Llama-3.1-405B
      precision: BF16
-      training_modes: [finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_qlora]
    - model: Llama 3 8B
      mad_tag: pyt_train_llama-3-8b
      model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
      model_repo: Llama-2-70B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32B
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS
-32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atoimcExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ CAS,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicSub,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicInc,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicDec,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ Native,⚠️ Scope Downgrade - CAS
+32 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicExch,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicCAS,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAnd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicOr,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit atomicXor,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atoimcExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicInc,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicDec,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 half2 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicExch,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicOr,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit atomicXor,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
-32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS
-64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native
-32 bit atoimcExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAdd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicMin,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+32 bit float atomicMax,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade - CAS,✅ CAS,⚠️ Scope Downgrade - CAS
+64 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMin,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMax,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 bfloat162 atomicAdd,❌ NOP,❌ NOP,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicExch,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicCAS,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
@@ -1,325 +1,325 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X,MI300A
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
-32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
-64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native
-16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native
-32 bit atoimcExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
-64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
-64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,✅ NoReturn,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ Native,✅ Native,✅ Native
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicSub,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicInc,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicDec,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicMin,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicMax,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit float atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+32 bit float atomicMax,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
+64 bit float atomicAdd,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMin,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit float atomicMax,✅ CAS,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 half2 atomicAdd,❌ NOP,❌ NOP,❌ NOP,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+16bx2 bfloat162 atomicAdd,✅ CAS,✅ CAS,✅ CAS,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+32 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+32 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicExch,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicCAS,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
+64 bit atomicAnd,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicOr,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
+64 bit atomicXor,❌ NOP,❌ NOP,✅ Native,⚠️ Scope Downgrade,✅ Native,⚠️ Scope Downgrade
--- a/docs/data/reference/precision-support/precision-support.yaml
+++ b/docs/data/reference/precision-support/precision-support.yaml
@@ -0,0 +1,391 @@
+# precision-support.yaml
+library_groups:
+  - group: "ML & Computer Vision"
+    tag: "ml-cv"
+    libraries:
+      - name: "Composable Kernel"
+        tag: "composable-kernel"
+        doc_link: "composable_kernel:reference/Composable_Kernel_supported_scalar_types"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "float4"
+            support: "✅"
+          - type: "float6 (E2M3)"
+            support: "✅"
+          - type: "float6 (E3M2)"
+            support: "✅"
+          - type: "float8 (E4M3)"
+            support: "✅"
+          - type: "float8 (E5M2)"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "MIGraphX"
+        tag: "migraphx"
+        doc_link: "amdmigraphx:reference/cpp"
+        data_types:
+          - type: "int8"
+            support: "⚠️"
+          - type: "int16"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "int64"
+            support: "✅"
+          - type: "float8 (E4M3)"
+            support: "✅"
+          - type: "float8 (E5M2)"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "MIOpen"
+        tag: "miopen"
+        doc_link: "miopen:reference/datatypes"
+        data_types:
+          - type: "int8"
+            support: "⚠️"
+          - type: "int32"
+            support: "⚠️"
+          - type: "float8 (E4M3)"
+            support: "⚠️"
+          - type: "float8 (E5M2)"
+            support: "⚠️"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "⚠️"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "⚠️"
+
+  - group: "Communication"
+    tag: "communication"
+    libraries:
+      - name: "RCCL"
+        tag: "rccl"
+        doc_link: "rccl:api-reference/library-specification"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "int64"
+            support: "✅"
+          - type: "float8 (E4M3)"
+            support: "✅"
+          - type: "float8 (E5M2)"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+  - group: "Math Libraries"
+    tag: "math-libs"
+    libraries:
+      - name: "hipBLAS"
+        tag: "hipblas"
+        doc_link: "hipblas:reference/data-type-support"
+        data_types:
+          - type: "float16"
+            support: "⚠️"
+          - type: "bfloat16"
+            support: "⚠️"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "hipBLASLt"
+        tag: "hipblaslt"
+        doc_link: "hipblaslt:reference/data-type-support"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "float4"
+            support: "✅"
+          - type: "float6 (E2M3)"
+            support: "✅"
+          - type: "float6 (E3M2)"
+            support: "✅"
+          - type: "float8 (E4M3)"
+            support: "✅"
+          - type: "float8 (E5M2)"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+
+      - name: "hipFFT"
+        tag: "hipfft"
+        doc_link: "hipfft:reference/fft-api-usage"
+        data_types:
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "hipRAND"
+        tag: "hiprand"
+        doc_link: "hiprand:api-reference/data-type-support"
+        data_types:
+          - type: "int8"
+            support: "Output only"
+          - type: "int16"
+            support: "Output only"
+          - type: "int32"
+            support: "Output only"
+          - type: "int64"
+            support: "Output only"
+          - type: "float16"
+            support: "Output only"
+          - type: "float32"
+            support: "Output only"
+          - type: "float64"
+            support: "Output only"
+
+      - name: "hipSOLVER"
+        tag: "hipsolver"
+        doc_link: "hipsolver:reference/precision"
+        data_types:
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "hipSPARSE"
+        tag: "hipsparse"
+        doc_link: "hipsparse:reference/precision"
+        data_types:
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "hipSPARSELt"
+        tag: "hipsparselt"
+        doc_link: "hipsparselt:reference/data-type-support"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "float8 (E4M3)"
+            support: "✅"
+          - type: "float8 (E5M2)"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+
+      - name: "rocBLAS"
+        tag: "rocblas"
+        doc_link: "rocblas:reference/data-type-support"
+        data_types:
+          - type: "float16"
+            support: "⚠️"
+          - type: "bfloat16"
+            support: "⚠️"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "rocFFT"
+        tag: "rocfft"
+        doc_link: "rocfft:reference/api"
+        data_types:
+          - type: "float16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "rocRAND"
+        tag: "rocrand"
+        doc_link: "rocrand:api-reference/data-type-support"
+        data_types:
+          - type: "int8"
+            support: "Output only"
+          - type: "int16"
+            support: "Output only"
+          - type: "int32"
+            support: "Output only"
+          - type: "int64"
+            support: "Output only"
+          - type: "float16"
+            support: "Output only"
+          - type: "float32"
+            support: "Output only"
+          - type: "float64"
+            support: "Output only"
+
+      - name: "rocSOLVER"
+        tag: "rocsolver"
+        doc_link: "rocsolver:reference/precision"
+        data_types:
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "rocSPARSE"
+        tag: "rocsparse"
+        doc_link: "rocsparse:reference/precision"
+        data_types:
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "rocWMMA"
+        tag: "rocwmma"
+        doc_link: "rocwmma:api-reference/api-reference-guide"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int32"
+            support: "Output only"
+          - type: "float8 (E4M3)"
+            support: "Input only"
+          - type: "float8 (E5M2)"
+            support: "Input only"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "tensorfloat32"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "Tensile"
+        tag: "tensile"
+        doc_link: "tensile:reference/precision-support"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "float8 (E4M3)"
+            support: "✅"
+          - type: "float8 (E5M2)"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "tensorfloat32"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+  - group: "Primitives"
+    tag: "primitives"
+    libraries:
+      - name: "hipCUB"
+        tag: "hipcub"
+        doc_link: "hipcub:api-reference/data-type-support"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int16"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "int64"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "hipTensor"
+        tag: "hiptensor"
+        doc_link: "hiptensor:api-reference/api-reference"
+        data_types:
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "rocPRIM"
+        tag: "rocprim"
+        doc_link: "rocprim:reference/data-type-support"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int16"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "int64"
+            support: "✅"
+          - type: "float16"
+            support: "✅"
+          - type: "bfloat16"
+            support: "✅"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
+
+      - name: "rocThrust"
+        tag: "rocthrust"
+        doc_link: "rocthrust:data-type-support"
+        data_types:
+          - type: "int8"
+            support: "✅"
+          - type: "int16"
+            support: "✅"
+          - type: "int32"
+            support: "✅"
+          - type: "int64"
+            support: "✅"
+          - type: "float16"
+            support: "⚠️"
+          - type: "bfloat16"
+            support: "⚠️"
+          - type: "float32"
+            support: "✅"
+          - type: "float64"
+            support: "✅"
--- a/docs/data/rocm-software-stack-7_0_0.jpg
+++ b/docs/data/rocm-software-stack-7_0_0.jpg
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -23,93 +23,114 @@ The table below summarizes information about ROCm-enabled deep learning framewor
      - Installation options
      - GitHub

-    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
+    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_ 
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_ 
-        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
+        - `Upstream Dockerfile <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
+
+    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__

      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a> 

-    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
+    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
+
+    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>

-    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
+    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
+
+    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 

-    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
+    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
+
+    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_ 
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__

      - .. raw:: html
-         
-          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>      

+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.
@@ -124,10 +145,3 @@ through the following guides.

 * :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

-
-
-
-
-
-
-
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
 The GEMM library
 `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
 provides a benchmark tool for its supported operations. Refer to the
-`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
+`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
 for details.

 * Example 1: Benchmark mix fp8 GEMM
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
@@ -0,0 +1,445 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker-812:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}
+
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
+MI300X series accelerators.
+
+What's new
+==========
+
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
+
+* Upgraded to vLLM v0.10.
+
+* FP8 KV cache support via AITER.
+
+* Full graph capture support via AITER.
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   .. _vllm-benchmark-available-models-812:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+      <div class="row">
+         <div class="col-2 me-2 model-param-head">Model group</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+         </div>
+      </div>
+
+      <div class="row mt-1">
+         <div class="col-2 me-2 model-param-head">Model</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+         </div>
+      </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-812:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements-812:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before running inference benchmarks.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad-812:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+            Although the :ref:`available models
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
+            offline throughput and online serving performance data, you can
+            also change the benchmarking parameters. See the standalone
+            benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
+               the ``--tunableop on`` argument in your run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
+               performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. code-block::
+
+                  ./run.sh \
+                      --config $CONFIG_CSV \
+                      --model_repo {{ model.model_repo }} \
+                      <overrides>
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``--config``
+                       - ``configs/default.csv``
+                       - Run configs from the CSV for the chosen model repo and benchmark.
+
+                     * -
+                       - ``configs/extended.csv``
+                       - 
+
+                     * -
+                       - ``configs/performance.csv``
+                       - 
+
+                     * - ``--benchmark``
+                       - ``throughput``
+                       - Measure offline end-to-end throughput.
+
+                     * - 
+                       - ``serving``
+                       - Measure online serving performance.
+
+                     * - 
+                       - ``all``
+                       - Measure both throughput and serving.
+
+                     * - ``<overrides>``
+                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
+                       - Additional overrides to the config CSV.
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               .. note::
+
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark throughput
+
+              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
+
+            * Serving benchmark
+
+              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block::
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark serving
+
+              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm/vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.
+
+   .. code-block:: shell
+
+      cd vllm
+      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
+
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
+
+   .. code-block:: shell
+
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -16,7 +16,7 @@ vLLM inference performance testing

 .. _vllm-benchmark-unified-docker-715:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -46,7 +46,7 @@ vLLM inference performance testing
        - {{ unified_docker.hipblaslt_version }}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-715>` for
 MI300X series accelerators.

 What's new
@@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D
 Supported models
 ================

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -219,7 +219,7 @@ system's configuration.
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
 This table lists previous versions of the ROCm vLLM inference Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
-previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
+previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__.

 .. list-table::
   :header-rows: 1
@@ -16,103 +16,112 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

-   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909``
       (latest)
-     - 
+     -
+       * ROCm 6.4.1
+       * vLLM 0.10.1
+       * PyTorch 2.7.0
+     -
+       * :doc:`Documentation <../vllm>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__
+
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
+     -
       * ROCm 6.4.1
       * vLLM 0.10.0
       * PyTorch 2.7.0
-     - 
-       * :doc:`Documentation <../vllm>`
+     -
+       * :doc:`Documentation <vllm-0.10.0-20250812>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
-     - 
+     -
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.9.1-20250715>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
-     - 
+     -
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.9.1-20250702>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
-     - 
+     -
       * ROCm 6.4.1
       * vLLM 0.9.0.1
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.9.0.1-20250605>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`__

   * - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
-     - 
+     -
       * ROCm 6.3.1
       * 0.8.5 vLLM (0.8.6.dev)
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.8.5-20250521>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__

   * - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
-     - 
+     -
       * ROCm 6.3.1
       * vLLM 0.8.5
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.8.5-20250513>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__

   * - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
-     - 
+     -
       * ROCm 6.3.1
       * vLLM 0.8.3
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.8.3-20250415>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__

   * - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
-     - 
+     -
       * ROCm 6.3.1
       * vLLM 0.7.3
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.7.3-20250325>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__

   * - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
-     - 
+     -
       * ROCm 6.3.1
       * vLLM 0.6.6
       * PyTorch 2.7.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.6.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__

   * - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
-     - 
+     -
       * ROCm 6.2.1
       * vLLM 0.6.4
       * PyTorch 2.5.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.6.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__

   * - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
-     - 
+     -
       * ROCm 6.2.0
       * vLLM 0.4.3
       * PyTorch 2.4.0
-     - 
+     -
       * :doc:`Documentation <vllm-0.4.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -31,26 +31,30 @@ PyTorch inference performance testing
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-          </div>
-        </div>
-
-        <div class="row mt-1" style="display: none;">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-            <div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
-   {% endfor %}
-          </div>
-        </div>
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1" style="display: none;">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
      </div>

   {% for model_group in model_groups %}
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
@@ -2,19 +2,19 @@
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
   :keywords: model, MAD, automation, dashboarding, validate

-************************************
-SGLang inference performance testing
-************************************
+*****************************************************************
+SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
+*****************************************************************

 .. _sglang-benchmark-unified-docker:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml

-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+   {% set docker = data.dockers[0] %}

   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
   serving engine for large language models (LLMs) and vision models. The
-   ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
+   ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
   bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
   accelerators. It includes the following software components:

@@ -24,14 +24,10 @@ SGLang inference performance testing
      * - Software component
        - Version

-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `SGLang <https://docs.sglang.ai/index.html>`__
-        - {{ unified_docker.sglang_version }} 
-
-      * - `PyTorch <https://github.com/pytorch/pytorch>`__
-        - {{ unified_docker.pytorch_version }} 
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 System validation
 =================
@@ -50,8 +46,8 @@ system's configuration.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml

-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
-   {% set model_groups = data.sglang_benchmark.model_groups %}
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

   Pull the Docker image
   =====================
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -1,20 +1,18 @@
 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
-                 ROCm vLLM Docker image.
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

 **********************************
 vLLM inference performance testing
 **********************************

-.. _vllm-benchmark-unified-docker-812:
+.. _vllm-benchmark-unified-docker-909:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}

-   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
@@ -26,20 +24,13 @@ vLLM inference performance testing
      * - Software component
        - Version

-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `vLLM <https://docs.vllm.ai/en/latest>`__
-        - {{ unified_docker.vllm_version }}
-
-      * - `PyTorch <https://github.com/ROCm/pytorch>`__
-        - {{ unified_docker.pytorch_version }}
-
-      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-        - {{ unified_docker.hipblaslt_version }}
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-909>` for
 MI300X series accelerators.

 What's new
@@ -47,21 +38,23 @@ What's new

 The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.

-* Upgraded to vLLM v0.10.
+* Upgraded to vLLM v0.10.1.

-* FP8 KV cache support via AITER.
+* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.

-* Full graph capture support via AITER.
+* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
+
+.. _vllm-benchmark-supported-models-909:

 Supported models
 ================

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

-   .. _vllm-benchmark-available-models-812:
+   .. _vllm-benchmark-available-models-909:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -70,55 +63,51 @@ Supported models
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-      <div class="row">
-         <div class="col-2 me-2 model-param-head">Model group</div>
-         <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-         </div>
-      </div>
-
-      <div class="row mt-1">
-         <div class="col-2 me-2 model-param-head">Model</div>
-         <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-         {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% endif %}
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
-   {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
         </div>
      </div>
-      </div>

-   .. _vllm-benchmark-vllm-812:
+   .. _vllm-benchmark-vllm-909:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

-   .. container:: model-doc {{model.mad_tag}}
+   .. container:: model-doc {{ model.mad_tag }}

      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
+      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
+         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
+      {% endif %}

      {% endfor %}
   {% endfor %}

-.. note::
-
-   vLLM is a toolkit and library for LLM inference and serving. AMD implements
-   high-performance custom kernels and modules in vLLM to enhance performance.
-   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-   more information.
-
-.. _vllm-benchmark-performance-measurements-812:
+.. _vllm-benchmark-performance-measurements-909:

 Performance measurements
 ========================
@@ -151,18 +140,18 @@ system's configuration.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

   Pull the Docker image
   =====================

-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

-      docker pull {{ unified_docker.pull_tag }}
+      docker pull {{ docker.pull_tag }}

   Benchmarking
   ============
@@ -170,7 +159,7 @@ system's configuration.
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad-812:
+   .. _vllm-benchmark-mad-909:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -181,6 +170,9 @@ system's configuration.

         .. tab-item:: MAD-integrated benchmarking

+            The following run command is tailored to {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
+
            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

@@ -208,7 +200,7 @@ system's configuration.
            and ``{{ model.mad_tag }}_serving.csv``.

            Although the :ref:`available models
-            <vllm-benchmark-available-models>` are preconfigured to collect
+            <vllm-benchmark-available-models-909>` are preconfigured to collect
            offline throughput and online serving performance data, you can
            also change the benchmarking parameters. See the standalone
            benchmarking tab for more information.
@@ -232,132 +224,143 @@ system's configuration.

         .. tab-item:: Standalone benchmarking

-            .. rubric:: Download the Docker image and required scripts
+            The following commands are optimized for {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.

-            1. Run the vLLM benchmark tool independently by starting the
-               `Docker container <{{ unified_docker.docker_hub_url }}>`_
-               as shown in the following snippet.
+            .. seealso::
+
+               For more information on configuration, see the `config files
+               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
+               in the MAD repository. Refer to the `vLLM engine arguments <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
+               for descriptions of available configuration options
+               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
+               additional benchmarking information.
+
+            .. rubric:: Launch the container
+
+            You can run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
+            in the following snippet.
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+               docker run -it \
+                   --device=/dev/kfd \
+                   --device=/dev/dri \
+                   --group-add video \
+                   --shm-size 16G \
+                   --security-opt seccomp=unconfined \
+                   --security-opt apparmor=unconfined \
+                   --cap-add=SYS_PTRACE \
+                   -v $(pwd):/workspace \
+                   --env HUGGINGFACE_HUB_CACHE=/workspace \
+                   --name test \
+                   {{ docker.pull_tag }}
+
+            .. rubric:: Throughput command
+
+            Use the following command to start the throughput benchmark.
+
+            .. code-block:: shell
+
+               model={{ model.model_repo }}
+               tp={{ model.config.tp }}
+               num_prompts=1024
+               in=128
+               out=128
+               dtype={{ model.config.dtype }}
+               kv_cache_dtype={{ model.config.kv_cache_dtype }}
+               max_num_seqs=1024
+               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+               max_model_len={{ model.config.max_model_len }}
+
+               vllm bench throughput --model $model \
+                   -tp $tp \
+                   --num-prompts $num_prompts \
+                   --input-len $in \
+                   --output-len $out \
+                   --dtype $dtype \
+                   --kv-cache-dtype $kv_cache_dtype \
+                   --max-num-seqs $max_num_seqs \
+                   --max-seq-len-to-capture $max_seq_len_to_capture \
+                   --max-num-batched-tokens $max_num_batched_tokens \
+                   --max-model-len $max_model_len \
+                   --trust-remote-code \
+                   --output-json ${model}_throughput.json \
+                   --gpu-memory-utilization 0.9
+
+            .. rubric:: Serving command
+
+            1. Start the server using the following command:

               .. code-block:: shell

-                  docker pull {{ unified_docker.pull_tag }}
-                  docker run -it \
-                      --device=/dev/kfd \
-                      --device=/dev/dri \
-                      --group-add video \
-                      --shm-size 16G \
-                      --security-opt seccomp=unconfined \
-                      --security-opt apparmor=unconfined \
-                      --cap-add=SYS_PTRACE \
-                      -v $(pwd):/workspace \
-                      --env HUGGINGFACE_HUB_CACHE=/workspace \
-                      --name test \
-                      {{ unified_docker.pull_tag }}
+                  model={{ model.model_repo }}
+                  tp={{ model.config.tp }}
+                  dtype={{ model.config.dtype }}
+                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
+                  max_num_seqs=256
+                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+                  max_model_len={{ model.config.max_model_len }}

-            2. In the Docker container, clone the ROCm MAD repository and navigate to the
-               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+                  vllm serve $model \
+                      -tp $tp \
+                      --dtype $dtype \
+                      --kv-cache-dtype $kv_cache_dtype \
+                      --max-num-seqs $max_num_seqs \
+                      --max-seq-len-to-capture $max_seq_len_to_capture \
+                      --max-num-batched-tokens $max_num_batched_tokens \
+                      --max-model-len $max_model_len \
+                      --no-enable-prefix-caching \
+                      --swap-space 16 \
+                      --disable-log-requests \
+                      --trust-remote-code \
+                      --gpu-memory-utilization 0.9
+
+               Wait until the model has loaded and the server is ready to accept requests.
+
+            2. In another terminal on the same machine, run the benchmark:

               .. code-block:: shell

-                  git clone https://github.com/ROCm/MAD
-                  cd MAD/scripts/vllm
+                  # Connect to the container
+                  docker exec -it test bash

-            3. To start the benchmark, use the following command with the appropriate options.
+                  # Wait for the server to start
+                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
+
+                  # Run the benchmark
+                  model={{ model.model_repo }}
+                  max_concurrency=1
+                  num_prompts=10
+                  in=128
+                  out=128
+                  vllm bench serve --model $model \
+                      --percentile-metrics "ttft,tpot,itl,e2el" \
+                      --dataset-name random \
+                      --ignore-eos \
+                      --max-concurrency $max_concurrency \
+                      --num-prompts $num_prompts \
+                      --random-input-len $in \
+                      --random-output-len $out \
+                      --trust-remote-code \
+                      --save-result \
+                      --result-filename ${model}_serving.json
+
+            .. note::
+
+               If you encounter the following error, set ``HF_TOKEN`` to a Hugging Face
+               token that is authorized to access the gated model.

               .. code-block::

-                  ./run.sh \
-                      --config $CONFIG_CSV \
-                      --model_repo {{ model.model_repo }} \
-                      <overrides>
+                  OSError: You are trying to access a gated repo.

-               .. dropdown:: Benchmark options
-                  :open:
-
-                  .. list-table::
-                     :header-rows: 1
-                     :align: center
-
-                     * - Name
-                       - Options
-                       - Description
-
-                     * - ``--config``
-                       - ``configs/default.csv``
-                       - Run configs from the CSV for the chosen model repo and benchmark.
-
-                     * -
-                       - ``configs/extended.csv``
-                       - 
-
-                     * -
-                       - ``configs/performance.csv``
-                       - 
-
-                     * - ``--benchmark``
-                       - ``throughput``
-                       - Measure offline end-to-end throughput.
-
-                     * - 
-                       - ``serving``
-                       - Measure online serving performance.
-
-                     * - 
-                       - ``all``
-                       - Measure both throughput and serving.
-
-                     * - `<overrides>`
-                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
-                       - Additional overrides to the config CSV.
-
-                  The input sequence length, output sequence length, and tensor parallel (TP) are
-                  already configured. You don't need to specify them with this script.
-
-               .. note::
-
-                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
-
-                  If you encounter the following error, pass your access-authorized Hugging
-                  Face token to the gated models.
-
-                  .. code-block::
-
-                     OSError: You are trying to access a gated repo.
-
-                     # pass your HF_TOKEN
-                     export HF_TOKEN=$your_personal_hf_token
-
-            .. rubric:: Benchmarking examples
-
-            Here are some examples of running the benchmark with various options:
-
-            * Throughput benchmark
-
-              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block:: shell
-
-                 export MAD_MODEL_NAME={{ model.mad_tag }}
-                 ./run.sh \
-                     --config configs/default.csv \
-                     --model_repo {{model.model_repo}} \
-                     --benchmark throughput
-
-              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
-
-            * Serving benchmark
-
-              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block::
-
-                 export MAD_MODEL_NAME={{ model.mad_tag }}
-                 ./run.sh \
-                     --config configs/default.csv \
-                     --model_repo {{model.model_repo}} \
-                     --benchmark serving
-
-              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token

            .. raw:: html

@@ -382,7 +385,7 @@ Advanced usage
 ==============

 For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.

 Reproducing the Docker image
 ----------------------------
@@ -400,7 +403,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
   .. code-block:: shell

      cd vllm
-      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
+      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29

 3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

@@ -419,15 +422,12 @@ Further reading
 - To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

+- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+  a brief introduction to vLLM and optimization strategies.
+
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

- To learn how to run community models from Hugging Face on AMD GPUs, see
-  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
-
- To learn how to fine-tune LLMs and optimize inference, see
-  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
-
 - For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 <rocm-install-on-linux:install/quick-start>`.

 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
-`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
+`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.

-You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
+You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
 distribution's package manager. See the following documentation resources to get started:

 * :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -2,9 +2,9 @@
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

-**************************************
-Training a model with MaxText for ROCm
-**************************************
+******************************************
+Training a model with JAX MaxText for ROCm
+******************************************

 MaxText is a high-performance, open-source framework built on the Google JAX
 machine learning library to train LLMs at scale. The MaxText framework for
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
 `<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
 on AMD MI300X series accelerators.

-The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+The MaxText for ROCm training Docker image
 provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| JAX                      | 0.4.35                         |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.12                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.13.0-ae9c477a                |
-+--------------------------+--------------------------------+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-Supported features and models
-=============================
+   {% set dockers = data.dockers %}
+   .. tab-set::

-MaxText provides the following key features to train large language models efficiently:
+      {% for docker in dockers %}
+      {% set jax_version = docker.components["JAX"] %}
+
+      .. tab-item:: JAX {{ jax_version }}
+         :sync: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+
+            {% endfor %}
+         {% if jax_version == "0.6.0" %}
+         .. note::
+
+            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
+            not configured correctly. For now you can turn it off by setting
+            ``shardy=False`` during the training run. You can also follow the `migration
+            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
+            it.
+
+            The provided multi-node training scripts in this documentation are
+            not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
+            Docker image.
+         {% endif %}
+
+      {% endfor %}
+
+MaxText on ROCm provides the following key features to train large language models efficiently:

 - Transformer Engine (TE)

- Flash Attention (FA) 3
+- Flash Attention (FA) 3 -- with or without sequence input packing

 - GEMM tuning

 - Multi-node support

-.. _amd-maxtext-model-support:
+- NANOO FP8 quantization support

-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+.. _amd-maxtext-model-support-v257:

-* Llama 3.3 70B
+Supported models
+================

-* Llama 3.1 8B
+The following models are pre-optimized for performance on AMD Instinct MI300
+series accelerators. Some instructions, commands, and available training
+configurations in this documentation might vary by model -- select one to get
+started.

-* Llama 3.1 70B
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-* Llama 3 8B
+   {% set model_groups = data.model_groups %}
+   .. raw:: html

-* Llama 3 70B
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>

-* Llama 2 7B
-
-* Llama 2 70B
-
-* DeepSeek-V2-Lite
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>

 .. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

-Unsupported features
--------------------
-
-Currently, MaxText's default packed input format is not supported. Using this format
-with the current Docker image results in incorrect attention calculations
-across different input sequences. Support for packed input format is planned for a future release.
-
 System validation
 =================

@@ -98,14 +136,14 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.

-.. _amd-maxtext-multi-node-setup:
+.. _amd-maxtext-multi-node-setup-v257:

 Multi-node setup
 ----------------

 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

 1. Install the following packages to build and install the RDMA driver.

@@ -170,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

   e. RDMA interface

-      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.

      .. code-block:: bash
@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9

-.. _amd-maxtext-download-docker:
+.. _amd-maxtext-get-started-v257:

-Pull the Docker image
---------------------
+Benchmarking
+============

-1. Use the following command to pull the Docker image from Docker Hub.
+Once the setup is complete, choose between two options to reproduce the
+benchmark results:

-   .. code-block:: shell
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-      docker pull rocm/jax-training:maxtext-v25.5
+   .. _vllm-benchmark-mad:

-2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
-   and execute the benchmark.
+   {% set dockers = data.dockers %}
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}

-   .. code-block:: shell
+   .. container:: model-doc {{model.mad_tag}}

-      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+      .. tab-set::

-.. _amd-maxtext-get-started:
+         {% if model.mad_tag and "single-node" in model.doc_options %}
+         .. tab-item:: MAD-integrated benchmarking

-Getting started
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the {{ model.model }} model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+         {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            Run the JAX MaxText benchmark tool independently by starting the
+            Docker container as shown in the following snippet.
+
+            .. tab-set::
+               {% for docker in dockers %}
+               {% set jax_version = docker.components["JAX"] %}
+
+               .. tab-item:: JAX {{ jax_version }}
+                  :sync: {{ docker.pull_tag }}
+
+                  .. code-block:: shell
+
+                     docker pull {{ docker.pull_tag }}
+               {% endfor %}
+
+            {% if model.model_repo and "single-node" in model.doc_options %}
+            .. rubric:: Single node training
+
+            1. Set up environment variables.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
+                  export HF_HOME=<Location of saved/cached Hugging Face models>
+
+               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
+               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
+
+               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
+               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
+               Downloaded files typically get cached to ``~/.cache/huggingface``.
+
+            2. Launch the Docker container.
+
+               .. tab-set::
+                  {% for docker in dockers %}
+                  {% set jax_version = docker.components["JAX"] %}
+
+                  .. tab-item:: JAX {{ jax_version }}
+                     :sync: {{ docker.pull_tag }}
+
+                     .. code-block:: shell
+
+                        docker run -it \
+                            --device=/dev/dri \
+                            --device=/dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add=SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            -v $HF_HOME:/hf_cache \
+                            -e HF_HOME=/hf_cache \
+                            -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}
+
+            3. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/jax-maxtext
+
+            4. Run the setup scripts to install libraries and datasets needed
+               for benchmarking.
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
+
+            5. To run the training benchmark without quantization, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
+
+               For quantized training, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
+
+               .. important::
+
+                  Quantized training is not supported with the JAX 0.6.0 Docker image; support
+                  will be added in a future release. For quantized training, use the JAX 0.5.0
+                  Docker image: ``rocm/jax-training:maxtext-v25.7``.
+
+            {% endif %}
+            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
+            .. rubric:: Multi-node training
+
+            The following examples use SLURM to run on multiple nodes.
+
+            .. note::
+
+               The following scripts will launch the Docker container and run the
+               benchmark. Run them outside of any Docker container.
+
+            1. Make sure ``$HF_HOME`` is set before running the test. See
+               `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
+               for more details on downloading the Llama models before running the
+               benchmark.
+
+            2. To run multi-node training for {{ model.model }}, 
+               use the
+               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
+               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
+
+            3. Run the multi-node training benchmark script.
+
+               .. code-block:: shell
+
+                  sbatch -N <num_nodes> {{ model.multinode_training_script }}
+
+         {% else %}
+            .. rubric:: Multi-node training
+
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
+            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+Further reading
 ===============

-The following examples demonstrate how to get started with single node
-and multi-node training using the benchmarking scripts provided at
-`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.

-.. important::
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

-   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

-Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
-set correctly and points to your Hugging Face cache directory. Refer to the
-README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
-for more detailed instructions.
-
-Single node training benchmarking examples
------------------------------------------
-
-* Example 1: Single node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
-
-* Example 2: Single node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
-
-* Example 3: Single node training with Llama 3 8B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
-
-* Example 4: Single node training with Llama 3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
-
-* Example 5: Single node training with Llama 3.3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
-
-* Example 6: Single node training with DeepSeek V2 16B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
-
-  .. note::
-
-     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
-     the tokens/s as a performance indicator.
-
-Multi-node training benchmarking examples
-----------------------------------------
-
-The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
-own cluster setup.
-
-* Example 1: Multi-node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_7b_multinode.sh
-
-* Example 2: Multi-node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_70b_multinode.sh
-
-* Example 3: Multi-node training with Llama 3 8B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-* Example 4: Multi-node training with Llama 3 70B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_70b_multinode.sh
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -70,32 +70,32 @@ workloads:
   {% set model_groups = data.model_groups %}
   .. raw:: html

-         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-           <div class="row">
-             <div class="col-2 me-2 model-param-head">Model</div>
-             <div class="row col-10">
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
-             </div>
-           </div>
+            </div>
+         </div>

-           <div class="row mt-1">
-             <div class="col-2 me-2 model-param-head">Model variant</div>
-             <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
            {% if models|length % 3 == 0 %}
-               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
-               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
         {% endfor %}
      {% endfor %}
-             </div>
-           </div>
+            </div>
         </div>
+      </div>

 .. note::

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
     - Components
     - Resources

-   * - 25.5 (latest)
+   * - 25.7 (latest)
+     - 
+       * ROCm 6.4.1
+       * JAX 0.6.0, 0.5.0
+     - 
+       * :doc:`Documentation <../jax-maxtext>`
+       * `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
+       * `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
+
+   * - 25.5
     - 
       * ROCm 6.3.4
       * JAX 0.4.35
     - 
-       * :doc:`Documentation <../jax-maxtext>`
+       * :doc:`Documentation <jax-maxtext-v25.5>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__

   * - 25.4
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic

 - Multi-node support

-.. _amd-maxtext-model-support:
+.. _amd-maxtext-model-support-v254:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
@@ -0,0 +1,385 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using JAX MaxText for ROCm.
+   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with MaxText for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm JAX MaxText
+   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
+
+MaxText is a high-performance, open-source framework built on the Google JAX
+machine learning library to train LLMs at scale. The MaxText framework for
+ROCm is an optimized fork of the upstream
+`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
+on AMD MI300X series accelerators.
+
+The MaxText for ROCm training Docker image (``rocm/jax-training:maxtext-v25.5``)
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
+including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
+It includes the following software components:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| JAX                      | 0.4.35                         |
+--------------------------+--------------------------------+
+| Python                   | 3.10.12                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.13.0-ae9c477a                |
+--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+MaxText provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- Flash Attention (FA) 3
+
+- GEMM tuning
+
+- Multi-node support
+
+.. _amd-maxtext-model-support-v255:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+* Llama 3.3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+Unsupported features
+--------------------
+
+Currently, MaxText's default packed input format is not supported. Using this format
+with the current Docker image results in incorrect attention calculations
+across different input sequences. Support for packed input format is planned for a future release.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+as follows. Performance can vary for other training workloads, as AMD
+doesn’t validate configurations and run conditions outside those described.
+
+.. _amd-maxtext-multi-node-setup-v255:
+
+Multi-node setup
+----------------
+
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
+
+1. Install the following packages to build and install the RDMA driver.
+
+   .. code-block:: shell
+
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
+
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
+
+2. Set the following environment variables.
+
+   a. Master address
+
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-download-docker-v255:
+
+Pull the Docker image
+---------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/jax-training:maxtext-v25.5
+
+2. Use the following command to launch the Docker container. Note that the benchmarking scripts
+   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
+   and execute the benchmark.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+
+.. _amd-maxtext-get-started-v255:
+
+Getting started
+===============
+
+The following examples demonstrate how to get started with single node
+and multi-node training using the benchmarking scripts provided at
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+
+.. important::
+
+   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+
+Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.
+
+Single node training benchmarking examples
+------------------------------------------
+
+* Example 1: Single node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
+
+* Example 2: Single node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
+
+* Example 3: Single node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
+
+* Example 4: Single node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
+
+* Example 5: Single node training with Llama 3.3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
+
+* Example 6: Single node training with DeepSeek V2 16B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
+
+  .. note::
+
+     The TFLOP/s reported by MaxText for DeepSeek is not accurate. Use
+     the tokens/s as a performance indicator.
+
+Multi-node training benchmarking examples
+-----------------------------------------
+
+The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
+own cluster setup.
+
+* Example 1: Multi-node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_7b_multinode.sh
+
+* Example 2: Multi-node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_70b_multinode.sh
+
+* Example 3: Multi-node training with Llama 3 8B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_8b_multinode.sh
+
+* Example 4: Multi-node training with Llama 3 70B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_70b_multinode.sh
+
+Previous versions
+=================
+
+See :doc:`jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
-workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-24-12:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-3:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-4:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
     - Components
     - Resources

+   * - v25.7
+     - 
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
+     - 
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
+
   * - v25.6
     - 
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <../pytorch-training>`
+       * :doc:`Documentation <pytorch-training-v25.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:

           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
@@ -0,0 +1,456 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm PyTorch
+   training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+training workloads:
+
++--------------------------+--------------------------------+
+| Software component       | Version                        |
++==========================+================================+
+| ROCm                     | 6.3.4                          |
++--------------------------+--------------------------------+
+| PyTorch                  | 2.8.0a0+git7d205b2             |
++--------------------------+--------------------------------+
+| Python                   | 3.10.17                        |
++--------------------------+--------------------------------+
+| Transformer Engine       | 1.14.0+2f85f5f2                |
++--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0.post1                    |
++--------------------------+--------------------------------+
+| hipBLASLt                | 0.15.0-8c6919d                 |
++--------------------------+--------------------------------+
+| Triton                   | 3.3.0                          |
++--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support-v256:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+
+   {% set unified_docker = data.unified_docker.latest %}
+   {% set model_groups = data.model_groups %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. note::
+
+      Some models require an external license agreement through a third party (for example, Meta).
+
+   .. _amd-pytorch-training-performance-measurements-v256:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   page provides reference throughput and latency measurements for training
+   popular AI models.
+
+   .. note::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+      should not be interpreted as the peak performance achievable by AMD
+      Instinct MI325X and MI300X accelerators or ROCm software.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+   before starting training.
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   This Docker image is optimized for specific model configurations outlined
+   below. Performance can vary for other training workloads, as AMD
+   doesn’t validate configurations and run conditions outside those described.
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to start benchmarking:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         directory and install the required packages on the host machine.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD
+            pip install -r requirements.txt
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            For example, use this command to run the performance benchmark test on the {{ model.model }} model
+            using one GPU with the {{ model.precision }} data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+         .. rubric:: Download the Docker image and required packages
+
+         Use the following command to pull the Docker image from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull {{ unified_docker.pull_tag }}
+
+         Run the Docker container.
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+
+         Use these commands if you exit the ``training_env`` container and need to return to it.
+
+         .. code-block:: shell
+
+            docker start training_env
+            docker exec -it training_env bash
+
+         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         repository and navigate to the benchmark scripts directory
+         ``/workspace/MAD/scripts/pytorch_train``.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         The following benchmarking examples require downloading models and datasets
+         from Hugging Face. To ensure successful access to gated repos, set your
+         ``HF_TOKEN``.
+
+         .. code-block:: shell
+
+            export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         Run the setup script to install libraries and datasets needed for benchmarking.
+
+         .. code-block:: shell
+
+            ./pytorch_benchmark_setup.sh
+
+         .. container:: model-doc pyt_train_llama-3.1-8b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+         .. container:: model-doc pyt_train_llama-3.1-70b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``torchdata``
+                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+               * - ``tomli``
+                 - `Tomli <https://pypi.org/project/tomli/>`_
+
+               * - ``tiktoken``
+                 - `tiktoken <https://github.com/openai/tiktoken>`_
+
+               * - ``blobfile``
+                 - `blobfile <https://pypi.org/project/blobfile/>`_
+
+               * - ``tabulate``
+                 - `tabulate <https://pypi.org/project/tabulate/>`_
+
+               * - ``wandb``
+                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+         .. container:: model-doc pyt_train_flux
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+               * - ``csvkit``
+                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+               * - ``deepspeed``
+                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+               * - ``diffusers``
+                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+               * - ``GitPython``
+                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+               * - ``opencv-python-headless``
+                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+               * - ``peft``
+                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+               * - ``protobuf``
+                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+               * - ``pytest``
+                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+               * - ``python-dotenv``
+                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+               * - ``seaborn``
+                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+               * - ``transformers``
+                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pretraining
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% endif %}
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+         {% endif %}
+
+         {% if model_group.tag == "fine-tuning" %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$training_mode``
+                 - ``finetune_fw``
+                 - Full weight fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_lora``
+                 - LoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_qlora``
+                 - QLoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``HF_finetune_lora``
+                 - LoRA fine-tuning with Hugging Face PEFT
+
+               * - ``$datatype``
+                 - ``BF16``
+                 - All models support BF16.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            .. note::
+
+               {{ model.model }} currently supports the following fine-tuning methods:
+
+            {% for method in model.training_modes %}
+               * ``{{ method }}``
+            {% endfor %}
+            {% if model.training_modes|length < 4 %}
+
+               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
+               does not currently provide YAML configuration files for other combinations of
+               model to fine-tuning method.
+               However, you can still configure your own YAML files to enable support for
+               fine-tuning methods not listed here by following existing patterns in the
+               ``/workspace/torchtune/recipes/configs`` directory.
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+               .. rubric:: Benchmarking examples
+
+               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -55,32 +55,32 @@ vary by model -- select one to get started.
   {% set model_groups = data.model_groups %}
   .. raw:: html

-         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-           <div class="row">
-             <div class="col-2 me-2 model-param-head">Model</div>
-             <div class="row col-10">
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
-             </div>
-           </div>
+            </div>
+         </div>

-           <div class="row mt-1">
-             <div class="col-2 me-2 model-param-head">Model variant</div>
-             <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
            {% if models|length % 3 == 0 %}
-               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
-               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
         {% endfor %}
      {% endfor %}
-             </div>
-           </div>
+            </div>
         </div>
+      </div>

 .. note::

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.

-The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
-(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
-training workloads:
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| PyTorch                  | 2.8.0a0+git7d205b2             |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.17                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.14.0+2f85f5f2                |
-+--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0.post1                    |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.15.0-8c6919d                 |
-+--------------------------+--------------------------------+
-| Triton                   | 3.3.0                          |
-+--------------------------+--------------------------------+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+   training workloads:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 .. _amd-pytorch-training-model-support:

@@ -38,119 +35,152 @@ Supported models
 ================

 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   {% set unified_docker = data.unified_docker.latest %}
+   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}
-
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Workload</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-          </div>
-        </div>
-
-        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-         {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% endif %}
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
-   {% endfor %}
-          </div>
-        </div>
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
      </div>

-   .. note::

-      Some models require an external license agreement through a third party (for example, Meta).
+   .. _amd-pytorch-training-supported-training-modes:

-   .. _amd-pytorch-training-performance-measurements:
+   The following table lists supported training modes per model.

-   Performance measurements
-   ========================
+   .. dropdown:: Supported training modes

-   To evaluate performance, the
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endfor %}
+      {% endfor %}
+
+      .. note::
+
+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-   page provides reference throughput and latency measurements for training
-   popular AI models.
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X accelerators or ROCm software.

-   .. note::
+System validation
+=================

-      The performance data presented in
-      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-      should not be interpreted as the peak performance achievable by AMD
-      Instinct MI325X and MI300X accelerators or ROCm software.
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.

-   System validation
-   =================
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.

-   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-   before starting training.
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn't test configurations and run conditions outside those described.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+Run training
+============

-   This Docker image is optimized for specific model configurations outlined
-   below. Performance can vary for other training workloads, as AMD
-   doesn’t validate configurations and run conditions outside those described.
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   Benchmarking
-   ============
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

-   Once the setup is complete, choose between two options to start benchmarking:
+   Once the setup is complete, choose between two options to start benchmarking training:

   .. tab-set::

      .. tab-item:: MAD-integrated benchmarking

-         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-         directory and install the required packages on the host machine.
+         1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.

-         .. code-block:: shell
+            .. code-block:: shell

-            git clone https://github.com/ROCm/MAD
-            cd MAD
-            pip install -r requirements.txt
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

         .. container:: model-doc {{ model.mad_tag }}

-            For example, use this command to run the performance benchmark test on the {{ model.model }} model
-            using one GPU with the {{ model.precision }} data type on the host machine.
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.

-            .. code-block:: shell
+               .. code-block:: shell

-               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-               madengine run \
-                   --tags {{ model.mad_tag }} \
-                   --keep-model-dir \
-                   --live-output \
-                   --timeout 28800
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800

-            MAD launches a Docker container with the name
-            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/perf.csv``.
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.

      {% endfor %}
   {% endfor %}
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. rubric:: Download the Docker image and required packages

-         Use the following command to pull the Docker image from Docker Hub.
+         1. Use the following command to pull the Docker image from Docker Hub.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker pull {{ unified_docker.pull_tag }}
+               docker pull {{ unified_docker.pull_tag }}

-         Run the Docker container.
+         2. Run the Docker container.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ unified_docker.pull_tag }}

-         Use these commands if you exit the ``training_env`` container and need to return to it.
+            Use these commands if you exit the ``training_env`` container and need to return to it.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker start training_env
-            docker exec -it training_env bash
+               docker start training_env
+               docker exec -it training_env bash

-         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
-         repository and navigate to the benchmark scripts directory
-         ``/workspace/MAD/scripts/pytorch_train``.
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+            repository and navigate to the benchmark scripts directory
+            ``/workspace/MAD/scripts/pytorch_train``.

-         .. code-block:: shell
+            .. code-block:: shell

-            git clone https://github.com/ROCm/MAD
-            cd MAD/scripts/pytorch_train
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/pytorch_train

         .. rubric:: Prepare training datasets and dependencies

-         The following benchmarking examples require downloading models and datasets
-         from Hugging Face. To ensure successful access to gated repos, set your
-         ``HF_TOKEN``.
+         1. The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.

-         .. code-block:: shell
+            .. code-block:: shell

-            export HF_TOKEN=$your_personal_hugging_face_access_token
+               export HF_TOKEN=$your_personal_hugging_face_access_token

-         Run the setup script to install libraries and datasets needed for benchmarking.
+         2. Run the setup script to install libraries and datasets needed for benchmarking.

-         .. code-block:: shell
+            .. code-block:: shell

-            ./pytorch_benchmark_setup.sh
+               ./pytorch_benchmark_setup.sh

-         .. container:: model-doc pyt_train_llama-3.1-8b
+            .. container:: model-doc pyt_train_llama-3.1-8b

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-         .. container:: model-doc pyt_train_llama-3.1-70b
+            .. container:: model-doc pyt_train_llama-3.1-70b

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-               * - ``torchdata``
-                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+                  * - ``torchdata``
+                    - `TorchData <https://pytorch.org/data/beta/index.html>`_

-               * - ``tomli``
-                 - `Tomli <https://pypi.org/project/tomli/>`_
+                  * - ``tomli``
+                    - `Tomli <https://pypi.org/project/tomli/>`_

-               * - ``tiktoken``
-                 - `tiktoken <https://github.com/openai/tiktoken>`_
+                  * - ``tiktoken``
+                    - `tiktoken <https://github.com/openai/tiktoken>`_

-               * - ``blobfile``
-                 - `blobfile <https://pypi.org/project/blobfile/>`_
+                  * - ``blobfile``
+                    - `blobfile <https://pypi.org/project/blobfile/>`_

-               * - ``tabulate``
-                 - `tabulate <https://pypi.org/project/tabulate/>`_
+                  * - ``tabulate``
+                    - `tabulate <https://pypi.org/project/tabulate/>`_

-               * - ``wandb``
-                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+                  * - ``wandb``
+                    - `Weights & Biases <https://github.com/wandb/wandb>`_

-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-         .. container:: model-doc pyt_train_flux
+            .. container:: model-doc pyt_train_flux

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-               * - ``csvkit``
-                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+                  * - ``csvkit``
+                    - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

-               * - ``deepspeed``
-                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+                  * - ``deepspeed``
+                    - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

-               * - ``diffusers``
-                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+                  * - ``diffusers``
+                    - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

-               * - ``GitPython``
-                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+                  * - ``GitPython``
+                    - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

-               * - ``opencv-python-headless``
-                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+                  * - ``opencv-python-headless``
+                    - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

-               * - ``peft``
-                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+                  * - ``peft``
+                    - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

-               * - ``protobuf``
-                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+                  * - ``protobuf``
+                    - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

-               * - ``pytest``
-                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+                  * - ``pytest``
+                    - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

-               * - ``python-dotenv``
-                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+                  * - ``python-dotenv``
+                    - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

-               * - ``seaborn``
-                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+                  * - ``seaborn``
+                    - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

-               * - ``transformers``
-                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+                  * - ``transformers``
+                    - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

-         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

-         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+            * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
-         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}

         .. container:: model-doc {{ model.mad_tag }}

-            .. rubric:: Pretraining
+            .. rubric:: Pre-training

            To start the pre-training benchmark, use the following command with the
            appropriate options. See the following list of options and their descriptions.

            .. code-block:: shell

-               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
-
-            .. list-table::
-               :header-rows: 1
-
-               * - Name
-                 - Options
-                 - Description
-
-            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
-               * - ``$datatype``
-                 - ``BF16`` or ``FP8``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% else %}
-               * - ``$datatype``
-                 - ``BF16``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% endif %}
-
-               * - ``$sequence_length``
-                 - Sequence length for the language model.
-                 - Between 2048 and 8192. 8192 by default.
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length

            {% if model.mad_tag == "pyt_train_flux" %}
            .. container:: model-doc {{ model.mad_tag }}

               .. note::

+                  Currently, FLUX models are not supported out-of-the-box on ``{{ unified_docker.pull_tag }}``.
+                  To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`.
+
                  Occasionally, downloading the Flux dataset might fail. In the event of this
                  error, manually download it from Hugging Face at
                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
                  and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
                  the required dataset.
            {% endif %}
-         {% endif %}
-
-         {% if model_group.tag == "fine-tuning" %}
-         .. container:: model-doc {{ model.mad_tag }}
-
-            .. rubric:: Fine-tuning
-
-            To start the fine-tuning benchmark, use the following command with the
-            appropriate options. See the following list of options and their descriptions.
-
-            .. code-block:: shell
-
-               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

            .. list-table::
               :header-rows: 1
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                 - Options
                 - Description

-               * - ``$training_mode``
-                 - ``finetune_fw``
-                 - Full weight fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_lora``
-                 - LoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_qlora``
-                 - QLoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``HF_finetune_lora``
-                 - LoRA fine-tuning with Hugging Face PEFT
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}

               * - ``$datatype``
-                 - ``BF16``
-                 - All models support BF16.
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

               * - ``$sequence_length``
                 - Between 2048 and 16384.
                 - Sequence length for the language model.

+            {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
            .. note::

-               {{ model.model }} currently supports the following fine-tuning methods:
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:

-            {% for method in model.training_modes %}
-               * ``{{ method }}``
-            {% endfor %}
-            {% if model.training_modes|length < 4 %}
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.

-               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
-               does not currently provide YAML configuration files for other combinations of
-               model to fine-tuning method
-               However, you can still configure your own YAML files to enable support for
-               fine-tuning methods not listed here by following existing patterns in the
-               ``/workspace/torchtune/recipes/configs`` directory.
            {% endif %}
         {% endif %}
      {% endfor %}
   {% endfor %}

-               .. rubric:: Benchmarking examples
+            .. rubric:: Benchmarking examples

-               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.

 Further reading
 ===============
--- a/docs/index.md
+++ b/docs/index.md
@@ -65,7 +65,7 @@ ROCm documentation is organized into the following categories:
 * [ROCm libraries](./reference/api-libraries.md)
 * [ROCm tools, compilers, and runtimes](./reference/rocm-tools.md)
 * [Accelerator and  GPU hardware specifications](./reference/gpu-arch-specs.rst)
-* [Precision support](./reference/precision-support.rst)
+* [Data types and precision support](./reference/precision-support.rst)
 * [Graph safe support](./reference/graph-safe-support.rst)
 <!-- markdownlint-enable MD051 -->
 :::
--- a/docs/reference/gpu-arch-specs.rst
+++ b/docs/reference/gpu-arch-specs.rst
@@ -34,6 +34,40 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - SGPR File (KiB)
          - GFXIP Major version
          - GFXIP Minor version
+        *
+          - MI355X
+          - CDNA4
+          - gfx950
+          - 288
+          - 256 (32 per XCD)
+          - 64
+          - 160
+          - 256
+          - 32 (4 per XCD)
+          - 32
+          - 16 per 2 CUs
+          - 64 per 2 CUs
+          - 512
+          - 12.5
+          - 9
+          - 5
+        *
+          - MI350X
+          - CDNA4
+          - gfx950
+          - 288
+          - 256 (32 per XCD)
+          - 64
+          - 160
+          - 256
+          - 32 (4 per XCD)
+          - 32
+          - 16 per 2 CUs
+          - 64 per 2 CUs
+          - 512
+          - 12.5
+          - 9
+          - 5
        *
          - MI325X
          - CDNA3
--- a/docs/reference/gpu-atomics-operation.rst
+++ b/docs/reference/gpu-atomics-operation.rst
@@ -14,16 +14,26 @@ completed as an indivisible unit, preventing race conditions where simultaneous
 access to the same memory location could lead to incorrect or undefined
 behavior.

-This document details the various support of atomic read-modify-write
-(atomicRMW) operations on gfx9, gfx10, gfx11, gfx12, MI100, MI200 and MI300 AMD
-GPUs. The atomics operation type behavior effected by the memory locations,
-memory granularity or scope of operations.
+This topic summarizes the support of atomic read-modify-write
+(atomicRMW) operations on AMD GPUs and accelerators. This includes gfx9, gfx10,
+gfx11, and gfx12 targets and the following Instinct™ series:
+
+- MI100
+
+- MI200
+
+- MI300
+
+- MI350
+
+The atomics operation type behavior is affected by the memory locations, memory
+granularity, and scope of operations.

 Memory locations:

- :ref:`Device memory <hip:device_memory>`, i.e. VRAM, the RAM on a discrete GPU
-  device or in framebuffer carveout for APUs. This includes peer-device memory
-  within an Infinity Fabric™ hive.
+- :ref:`Device memory <hip:device_memory>`, that is, VRAM, the RAM on a discrete
+  GPU device or in framebuffer carveout for APUs. This includes peer-device
+  memory within an Infinity Fabric™ hive.

 - :ref:`Host memory <hip:host_memory>`: in DRAM associated with the CPU (or
  peer device memory using PCIe® (PCI Express) peer-to-peer). This can be two sub-types:
@@ -69,10 +79,10 @@ Scopes of operations:
 Support summary
 ================================================================================

-AMD Instinct™ accelerators
+AMD Instinct accelerators
 --------------------------------------------------------------------------------

-**MI300**
+**MI300 and MI350 series**

 - All atomicRMW operations are forwarded out to the Infinity Fabric.
 - Infinity Fabric supports common integer and bitwise atomics, FP32 atomic add,
@@ -85,7 +95,7 @@ AMD Instinct™ accelerators
  It will seem like atomics to the wave, but the CPU sees it as a non-atomic
  load-op-store sequence. This downgrades system-scope atomics to device-scope.

-**MI200**
+**MI200 series**

 - L2 cache and Infinity Fabric both support common integer and bitwise atomics.
 - L2 cache supports FP32 atomic add, packed-FP16 atomic add, and FP64 add,
--- a/docs/reference/precision-support.rst
+++ b/docs/reference/precision-support.rst
--- a/docs/release/versions.md
+++ b/docs/release/versions.md
@@ -10,6 +10,7 @@

 | Version | Release date |
 | ------- | ------------ |
+| [7.0.0](https://rocm.docs.amd.com/en/docs-7.0.0/) | September 16, 2025 |
 | [6.4.3](https://rocm.docs.amd.com/en/docs-6.4.3/) | August 7, 2025 |
 | [6.4.2](https://rocm.docs.amd.com/en/docs-6.4.2/) | July 21, 2025 |
 | [6.4.1](https://rocm.docs.amd.com/en/docs-6.4.1/) | May 21, 2025 |
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -32,19 +32,23 @@ subtrees:
      - file: compatibility/ml-compatibility/pytorch-compatibility.rst
        title: PyTorch compatibility
      - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
-        title: TensorFlow compatibility  
+        title: TensorFlow compatibility
      - file: compatibility/ml-compatibility/jax-compatibility.rst
        title: JAX compatibility
      - file: compatibility/ml-compatibility/verl-compatibility.rst
-        title: verl compatibility  
+        title: verl compatibility
      - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
        title: Stanford Megatron-LM compatibility
      - file: compatibility/ml-compatibility/dgl-compatibility.rst
-        title: DGL compatibility  
+        title: DGL compatibility
      - file: compatibility/ml-compatibility/megablocks-compatibility.rst
        title: Megablocks compatibility
      - file: compatibility/ml-compatibility/taichi-compatibility.rst
-        title: Taichi compatibility 
+        title: Taichi compatibility
+      - file: compatibility/ml-compatibility/ray-compatibility.rst
+        title: Ray compatibility
+      - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
+        title: llama.cpp compatibility
  - file: how-to/build-rocm.rst
    title: Build ROCm from source

@@ -143,7 +147,7 @@ subtrees:
  - file: how-to/setting-cus
    title: Set the number of CUs
  - file: how-to/Bar-Memory.rst
-    title: Troubleshoot BAR access limitation  
+    title: Troubleshoot BAR access limitation
  - url: https://github.com/amd/rocm-examples
    title: ROCm examples

@@ -163,7 +167,9 @@ subtrees:
          - url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
            title: White paper
          - file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
-            title: MI300 and MI200 Performance counter
+            title: MI300 and MI200 performance counters
+          - file: conceptual/gpu-arch/mi350-performance-counters.rst
+            title: MI350 series performance counters
      - file: conceptual/gpu-arch/mi250.md
        title: MI250 microarchitecture
        subtrees:
@@ -198,7 +204,7 @@ subtrees:
  - file: reference/gpu-arch-specs.rst
  - file: reference/gpu-atomics-operation.rst
  - file: reference/precision-support.rst
-    title: Precision support
+    title: Data types and precision support
  - file: reference/graph-safe-support.rst
    title: Graph safe support

--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,4 +1,4 @@
-rocm-docs-core==1.20.1
+rocm-docs-core==1.23.0
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile requirements.in
+#    pip-compile requirements.in
 #
 accessible-pygments==0.0.5
    # via pydata-sphinx-theme
@@ -19,34 +19,32 @@ babel==2.17.0
    # via
    #   pydata-sphinx-theme
    #   sphinx
-beautifulsoup4==4.13.4
+beautifulsoup4==4.13.5
    # via pydata-sphinx-theme
 breathe==4.36.0
    # via rocm-docs-core
-certifi==2025.4.26
+certifi==2025.8.3
    # via requests
-cffi==1.17.1
+cffi==2.0.0
    # via
    #   cryptography
    #   pynacl
-charset-normalizer==3.4.2
+charset-normalizer==3.4.3
    # via requests
 click==8.2.1
    # via
    #   jupyter-cache
    #   sphinx-external-toc
-comm==0.2.2
+comm==0.2.3
    # via ipykernel
-cryptography==45.0.3
+cryptography==45.0.7
    # via pyjwt
-debugpy==1.8.14
+debugpy==1.8.16
    # via ipykernel
 decorator==5.2.1
    # via ipython
 defusedxml==0.7.1
    # via sphinxcontrib-datatemplates
-deprecated==1.2.18
-    # via pygithub
 docutils==0.21.2
    # via
    #   myst-parser
@@ -54,17 +52,17 @@ docutils==0.21.2
    #   sphinx
 exceptiongroup==1.3.0
    # via ipython
-executing==2.2.0
+executing==2.2.1
    # via stack-data
-fastjsonschema==2.21.1
+fastjsonschema==2.21.2
    # via
    #   nbformat
    #   rocm-docs-core
 gitdb==4.0.12
    # via gitpython
-gitpython==3.1.44
+gitpython==3.1.45
    # via rocm-docs-core
-greenlet==3.2.3
+greenlet==3.2.4
    # via sqlalchemy
 idna==3.10
    # via requests
@@ -74,7 +72,7 @@ importlib-metadata==8.7.0
    # via
    #   jupyter-cache
    #   myst-nb
-ipykernel==6.29.5
+ipykernel==6.30.1
    # via myst-nb
 ipython==8.37.0
    # via
@@ -86,9 +84,9 @@ jinja2==3.1.6
    # via
    #   myst-parser
    #   sphinx
-jsonschema==4.24.0
+jsonschema==4.25.1
    # via nbformat
-jsonschema-specifications==2025.4.1
+jsonschema-specifications==2025.9.1
    # via jsonschema
 jupyter-cache==1.0.1
    # via myst-nb
@@ -112,11 +110,11 @@ matplotlib-inline==0.1.7
    # via
    #   ipykernel
    #   ipython
-mdit-py-plugins==0.4.2
+mdit-py-plugins==0.5.0
    # via myst-parser
 mdurl==0.1.2
    # via markdown-it-py
-myst-nb==1.2.0
+myst-nb==1.3.0
    # via rocm-docs-core
 myst-parser==4.0.1
    # via myst-nb
@@ -134,15 +132,14 @@ nest-asyncio==1.6.0
 packaging==25.0
    # via
    #   ipykernel
-    #   pydata-sphinx-theme
    #   sphinx
-parso==0.8.4
+parso==0.8.5
    # via jedi
 pexpect==4.9.0
    # via ipython
-platformdirs==4.3.8
+platformdirs==4.4.0
    # via jupyter-core
-prompt-toolkit==3.0.51
+prompt-toolkit==3.0.52
    # via ipython
 psutil==7.0.0
    # via ipykernel
@@ -150,15 +147,15 @@ ptyprocess==0.7.0
    # via pexpect
 pure-eval==0.2.3
    # via stack-data
-pycparser==2.22
+pycparser==2.23
    # via cffi
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.1
    # via
    #   rocm-docs-core
    #   sphinx-book-theme
-pygithub==2.6.1
+pygithub==2.8.1
    # via rocm-docs-core
-pygments==2.19.1
+pygments==2.19.2
    # via
    #   accessible-pygments
    #   ipython
@@ -166,7 +163,7 @@ pygments==2.19.1
    #   sphinx
 pyjwt[crypto]==2.10.1
    # via pygithub
-pynacl==1.5.0
+pynacl==1.6.0
    # via pygithub
 python-dateutil==2.9.0.post0
    # via jupyter-client
@@ -178,7 +175,7 @@ pyyaml==6.0.2
    #   rocm-docs-core
    #   sphinx-external-toc
    #   sphinxcontrib-datatemplates
-pyzmq==26.4.0
+pyzmq==27.1.0
    # via
    #   ipykernel
    #   jupyter-client
@@ -186,13 +183,13 @@ referencing==0.36.2
    # via
    #   jsonschema
    #   jsonschema-specifications
-requests==2.32.4
+requests==2.32.5
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.20.1
-    # via -r requirements.in
-rpds-py==0.25.1
+rocm-docs-core==1.23.0
+    # via -r requirements.in
+rpds-py==0.27.1
    # via
    #   jsonschema
    #   referencing
@@ -202,7 +199,7 @@ smmap==5.0.2
    # via gitdb
 snowballstemmer==3.0.1
    # via sphinx
-soupsieve==2.7
+soupsieve==2.8
    # via beautifulsoup4
 sphinx==8.1.3
    # via
@@ -220,7 +217,7 @@ sphinx==8.1.3
    #   sphinx-reredirects
    #   sphinxcontrib-datatemplates
    #   sphinxcontrib-runcmd
-sphinx-book-theme==1.1.4
+sphinx-book-theme==1.1.3
    # via rocm-docs-core
 sphinx-copybutton==0.5.2
    # via rocm-docs-core
@@ -233,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8
 sphinx-notfound-page==1.1.0
    # via rocm-docs-core
 sphinx-reredirects==0.1.6
-    # via -r requirements.in
+    # via -r requirements.in
 sphinx-sitemap==2.8.0
-    # via -r requirements.in
+    # via -r requirements.in
 sphinxcontrib-applehelp==2.0.0
    # via sphinx
 sphinxcontrib-datatemplates==0.11.0
-    # via -r requirements.in
+    # via -r requirements.in
 sphinxcontrib-devhelp==2.0.0
    # via sphinx
 sphinxcontrib-htmlhelp==2.1.0
@@ -252,7 +249,7 @@ sphinxcontrib-runcmd==0.2.0
    # via sphinxcontrib-datatemplates
 sphinxcontrib-serializinghtml==2.0.0
    # via sphinx
-sqlalchemy==2.0.41
+sqlalchemy==2.0.43
    # via jupyter-cache
 stack-data==0.6.3
    # via ipython
@@ -260,13 +257,12 @@ tabulate==0.9.0
    # via jupyter-cache
 tomli==2.2.1
    # via sphinx
-tornado==6.5.1
+tornado==6.5.2
    # via
    #   ipykernel
    #   jupyter-client
 traitlets==5.14.3
    # via
-    #   comm
    #   ipykernel
    #   ipython
    #   jupyter-client
@@ -274,7 +270,7 @@ traitlets==5.14.3
    #   matplotlib-inline
    #   nbclient
    #   nbformat
-typing-extensions==4.14.0
+typing-extensions==4.15.0
    # via
    #   beautifulsoup4
    #   exceptiongroup
@@ -290,7 +286,5 @@ urllib3==2.5.0
    #   requests
 wcwidth==0.2.13
    # via prompt-toolkit
-wrapt==1.17.2
-    # via deprecated
 zipp==3.23.0
    # via importlib-metadata
--- a/docs/sphinx/static/css/vllm-benchmark.css
+++ b/docs/sphinx/static/css/vllm-benchmark.css
@@ -7,15 +7,14 @@ html {
  --compat-head-color: var(--pst-color-surface);
  --compat-param-hover-color: var(--pst-color-link-hover);
  --compat-param-selected-color: var(--pst-color-primary);
+  --compat-border-color: var(--pst-color-border);
 }

 html[data-theme="light"] {
-  --compat-border-color: var(--pst-gray-500);
  --compat-param-disabled-color: var(--pst-gray-300);
 }

 html[data-theme="dark"] {
-  --compat-border-color: var(--pst-gray-600);
  --compat-param-disabled-color: var(--pst-gray-600);
 }

@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
  padding: 0 0 1rem 0;
 }

+div[data-param-k="model-group"],
 div[data-param-k="model"] {
  background-color: var(--compat-bg-color);
  padding: 2px;
@@ -31,40 +31,19 @@ div[data-param-k="model"] {
  cursor: pointer;
 }

+div[data-param-k="model-group"][data-param-state="selected"],
 div[data-param-k="model"][data-param-state="selected"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
 }

-div[data-param-k="model"][data-param-state="latest-version"] {
-  background-color: var(--compat-param-selected-color);
-  color: var(--compat-fg-color);
-}
-
-div[data-param-k="model"][data-param-state="disabled"] {
-  background-color: var(--compat-param-disabled-color);
-  text-decoration: line-through;
-  /* text-decoration-color: var(--pst-color-danger); */
-  cursor: auto;
-}
-
-div[data-param-k="model"]:not([data-param-state]):hover {
+div[data-param-k="model-group"]:hover,
+div[data-param-k="model"]:hover {
  background-color: var(--compat-param-hover-color);
-}
-
-div[data-param-k="model-group"] {
-  background-color: var(--compat-bg-color);
-  padding: 2px;
-  border: solid 1px var(--compat-border-color);
-  font-weight: 500;
-  cursor: pointer;
-}
-
-div[data-param-k="model-group"][data-param-state="selected"] {
-  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
 }

+/*
 div[data-param-k="model-group"][data-param-state="latest-version"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
 div[data-param-k="model-group"][data-param-state="disabled"] {
  background-color: var(--compat-param-disabled-color);
  text-decoration: line-through;
-  /* text-decoration-color: var(--pst-color-danger); */
+  text-decoration-color: var(--pst-color-danger);
  cursor: auto;
 }
-
-div[data-param-k="model-group"]:not([data-param-state]):hover {
-  background-color: var(--compat-param-hover-color);
-}
+*/

 .model-param-head {
  background-color: var(--compat-head-color);
  padding: 0.15rem 0.15rem 0.15rem 0.67rem;
-  /* margin: 2px; */
-  border-right: solid 2px var(--compat-accent-color);
+  border-right: solid 3px var(--compat-accent-color);
  font-weight: 600;
 }

 .model-param {
-  /* padding: 2px; */
-  /* margin: 0 2px 0 2px; */
-  /* margin: 2px; */
  border: solid 1px var(--compat-border-color);
  font-weight: 500;
 }
--- a/docs/what-is-rocm.rst
+++ b/docs/what-is-rocm.rst
@@ -10,7 +10,7 @@ ROCm is a software stack, composed primarily of open-source software, that
 provides the tools for programming AMD Graphics Processing Units (GPUs), from
 low-level kernels to high-level end-user applications.

-.. image:: data/rocm-software-stack-6_4_0.jpg
+.. image:: data/rocm-software-stack-7_0_0.jpg
  :width: 800
  :alt: AMD's ROCm software stack and enabling technologies.
  :align: center
@@ -45,6 +45,10 @@ Machine Learning & Computer Vision
  ":doc:`rocJPEG <rocjpeg:index>`", "Library for decoding JPG images on AMD GPUs"
  ":doc:`rocPyDecode <rocpydecode:index>`", "Provides access to rocDecode APIs in both Python and C/C++ languages"

+.. note::
+
+  `rocCV <https://rocm.docs.amd.com/projects/rocCV/en/latest/index.html>`_ is an efficient GPU-accelerated library for image pre- and post-processing. rocCV is in an early access state. Using it on production workloads is not recommended.
+
 Communication
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/tools/rocm-build/rocm-7.0.0.xml
+++ b/tools/rocm-build/rocm-7.0.0.xml
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<manifest>
+    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
+    <default revision="refs/tags/rocm-7.0.0"
+     remote="rocm-org"
+     sync-c="true"
+     sync-j="4" />
+<!--list of projects for ROCm-->
+    <project name="ROCm" revision="roc-7.0.x" />
+    <project name="ROCK-Kernel-Driver" />
+    <project name="ROCR-Runtime" />
+    <project name="amdsmi" />
+    <project name="aqlprofile" />
+    <project name="rdc" />
+    <project name="rocm_bandwidth_test" />
+    <project name="rocm_smi_lib" />
+    <project name="rocm-core" />
+    <project name="rocm-examples" />
+    <project name="rocminfo" />
+    <project name="rocprofiler" />
+    <project name="rocprofiler-register" />
+    <project name="rocprofiler-sdk" />
+    <project name="rocprofiler-compute" />
+    <project name="rocprofiler-systems" />
+    <project name="roctracer" />
+<!--HIP Projects-->
+    <project name="hip" />
+    <project name="hip-tests" />
+    <project name="HIPIFY" />
+    <project name="clr" />
+    <project name="hipother" />
+<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
+    <project name="half" />
+    <project name="llvm-project" />
+    <project name="spirv-llvm-translator" />
+<!-- gdb projects -->
+    <project name="ROCdbgapi" />
+    <project name="ROCgdb" />
+    <project name="rocr_debug_agent" />
+<!-- ROCm Libraries -->
+    <project groups="mathlibs" name="AMDMIGraphX" />
+    <project groups="mathlibs" name="MIVisionX" />
+    <project groups="mathlibs" name="ROCmValidationSuite" />
+    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipSOLVER" />
+    <project groups="mathlibs" name="hipTensor" />
+    <project groups="mathlibs" name="hipfort" />
+    <project groups="mathlibs" name="rccl" />
+    <project groups="mathlibs" name="rocAL" />
+    <project groups="mathlibs" name="rocALUTION" />
+    <project groups="mathlibs" name="rocDecode" />
+    <project groups="mathlibs" name="rocJPEG" />
+    <!-- The following components have been migrated to rocm-libraries:
+        hipBLAS-common hipBLAS hipBLASLt hipCUB
+        hipFFT hipRAND hipSPARSE hipSPARSELt
+        MIOpen rocBLAS rocFFT rocPRIM rocRAND
+        rocSPARSE rocThrust Tensile -->
+    <project groups="mathlibs" name="rocm-libraries" />
+    <project groups="mathlibs" name="rocPyDecode" />
+    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
+    <project groups="mathlibs" name="rocWMMA" />
+    <project groups="mathlibs" name="rocm-cmake" />
+    <project groups="mathlibs" name="rpp" />
+    <project groups="mathlibs" name="TransferBench" />
+<!-- Projects for OpenMP-Extras -->
+    <project name="aomp" path="openmp-extras/aomp" />
+    <project name="aomp-extras" path="openmp-extras/aomp-extras" />
+    <project name="flang" path="openmp-extras/flang" />
+</manifest>