Merge pull request #639 from ROCm/sync-develop-from-external

Sync develop from external
2026-01-06 21:33:57 -05:00 · 2025-11-26 11:10:19 -05:00
parent 7fd75919d1 2f7b2a7fa1
commit bb199aa2b9
21 changed files with 415 additions and 153 deletions
--- a/.azuredevops/components/AMDMIGraphX.yml
+++ b/.azuredevops/components/AMDMIGraphX.yml
@@ -128,6 +128,9 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+      parameters:
+        cmakeVersion: '3.28.6'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -152,6 +155,7 @@ jobs:
          -DCMAKE_BUILD_TYPE=Release
          -DGPU_TARGETS=${{ job.target }}
          -DAMDGPU_TARGETS=${{ job.target }}
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
          -DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
@@ -192,6 +196,9 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+      parameters:
+        cmakeVersion: '3.28.6'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -217,6 +224,7 @@ jobs:
          -DCMAKE_BUILD_TYPE=Release
          -DGPU_TARGETS=${{ job.target }}
          -DAMDGPU_TARGETS=${{ job.target }}
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
          -DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
--- a/.azuredevops/components/amdsmi.yml
+++ b/.azuredevops/components/amdsmi.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: amdsmi
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -31,7 +50,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: amdsmi_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -55,6 +74,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -65,50 +85,54 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: amdsmi_test_${{ job.os }}_${{ job.target }}
-    dependsOn: amdsmi_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: amdsmi
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hipTensor
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -51,7 +70,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hipTensor_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -66,12 +85,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
@@ -85,9 +107,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -95,44 +120,47 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: hipTensor_test_${{ job.target }}
-    timeoutInMinutes: 90
-    dependsOn: hipTensor_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: hipTensor
-        testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
-        testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.target }}
+      timeoutInMinutes: 90
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
+          testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocWMMA.yml
+++ b/.azuredevops/components/rocWMMA.yml
@@ -142,7 +142,7 @@ jobs:
 - ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.target }}
-      timeoutInMinutes: 270
+      timeoutInMinutes: 350
      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
      condition:
        and(succeeded(),
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -62,6 +62,7 @@ parameters:
    - llvm-project
    - MIOpen
    - MIVisionX
+    - rocm_smi_lib
    - rccl
    - rocALUTION
    - rocBLAS
@@ -100,6 +101,7 @@ parameters:
    - llvm-project
    - MIOpen
    - MIVisionX
+    - rocm_smi_lib
    - rccl
    - rocALUTION
    - rocBLAS
@@ -146,6 +148,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
      parameters:
@@ -245,5 +248,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
        environment: test
        gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -65,6 +65,13 @@ parameters:
    - pytest
    - pytest-cov
    - pytest-xdist
+- name: rocmDependencies
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - ROCR-Runtime
+    - rocprofiler-sdk
 - name: rocmTestDependencies
  type: object
  default:
@@ -101,10 +108,12 @@ jobs:
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}_${{ job.target }}
+          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    workspace:
@@ -119,6 +128,14 @@ jobs:
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        gpuTarget: ${{ job.target }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
--- a/.azuredevops/templates/steps/dependencies-dnf.yml
+++ b/.azuredevops/templates/steps/dependencies-dnf.yml
@@ -63,6 +63,7 @@ parameters:
    libopenblas-dev: openblas-devel
    libopenmpi-dev: openmpi-devel
    libpci-dev: libpciaccess-devel
+    libsimde-dev: simde-devel
    libssl-dev: openssl-devel
    # note: libstdc++-devel is in the base packages list
    libsystemd-dev: systemd-devel
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -35,8 +35,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    amdsmi:
-      pipelineId: 99
-      developBranch: amd-staging
+      pipelineId: 376
+      developBranch: develop
      hasGpuTarget: false
    aomp-extras:
      pipelineId: 111
@@ -115,7 +115,7 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    hipTensor:
-      pipelineId: 105
+      pipelineId: 374
      developBranch: develop
      hasGpuTarget: true
    llvm-project:
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -140,6 +140,7 @@ EoS
 etcd
 fas
 FBGEMM
+FiLM
 FIFOs
 FFT
 FFTs
@@ -160,10 +161,12 @@ Fortran
 Fuyu
 GALB
 GAT
+GATNE
 GCC
 GCD
 GCDs
 GCN
+GCNN
 GDB
 GDDR
 GDR
@@ -182,6 +185,8 @@ Glibc
 GLXT
 Gloo
 GMI
+GNN
+GNNs
 GPG
 GPR
 GPT
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -233,7 +233,7 @@ for a complete overview of this release.
 * Fixed certain output in `amd-smi monitor` when GPUs are partitioned. It fixes the issue with amd-smi monitor such as: `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, and `amd-smi monitor -Vqt --file /tmp/test1`. These commands will now be able to display as normal in partitioned GPU scenarios.

 ```{note}
-See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md) for details, examples, and in-depth descriptions.
+See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-710) for details, examples, and in-depth descriptions.
 ```

 ### **Composable Kernel** (1.1.0)
@@ -677,7 +677,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
  * Enabled `TCP_TCP_LATENCY` counter and associated counter for all GPUs except MI300.
 * Interactive metric descriptions in TUI analyze mode.
  * You can now left click on any metric cell to view detailed descriptions in the dedicated `METRIC DESCRIPTION` tab.
-* Support for analysis report output as a sqlite database using ``--output-format db`` analysis mode option.
+* Support for analysis report output as a SQLite database using ``--output-format db`` analysis mode option.
 * `Compute Throughput` panel to TUI's `High Level Analysis` category with the following metrics: VALU FLOPs, VALU IOPs, MFMA FLOPs (F8), MFMA FLOPs (BF16), MFMA FLOPs (F16), MFMA FLOPs (F32), MFMA FLOPs (F64), MFMA FLOPs (F6F4) (in gfx950), MFMA IOPs (Int8), SALU Utilization, VALU Utilization, MFMA Utilization, VMEM Utilization, Branch Utilization, IPC

 * `Memory Throughput` panel to TUI's `High Level Analysis` category with the following metrics: vL1D Cache BW, vL1D Cache Utilization, Theoretical LDS Bandwidth, LDS Utilization, L2 Cache BW, L2 Cache Utilization, L2-Fabric Read BW, L2-Fabric Write BW, sL1D Cache BW, L1I BW, Address Processing Unit Busy, Data-Return Busy, L1I-L2 Bandwidth, sL1D-L2 BW
@@ -763,7 +763,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
 * MI300A/X L2-Fabric 64B read counter may display negative values - The rocprof-compute metric 17.6.1 (Read 64B) can report negative values due to incorrect calculation when TCC_BUBBLE_sum + TCC_EA0_RDREQ_32B_sum exceeds TCC_EA0_RDREQ_sum.
  * A workaround has been implemented using max(0, calculated_value) to prevent negative display values while the root cause is under investigation.
 * The profile mode crashes when `--format-rocprof-output json` is selected.
-  * As a workaround, this option should either not be provided or should be set to `csv` instead of `json`. This issue does not affect the profiling results since both `csv` and `json` output formats lead to the same profiling data.  
+    * As a workaround, this option should either not be provided or should be set to `csv` instead of `json`. This issue does not affect the profiling results since both `csv` and `json` output formats lead to the same profiling data.  

 ### **ROCm Data Center Tool** (1.2.0)

@@ -804,6 +804,14 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
 - Updated PAPI module to v7.2.0b2.
 - ROCprofiler-SDK is now used for tracing OMPT API calls.

+#### Known issues
+
+* Profiling PyTorch and other AI workloads might fail because it is unable to find the libraries in the default linker path. As a workaround, you need to explicitly add the library path to ``LD_LIBRARY_PATH``. For example, when using PyTorch with Python 3.10, add the following to the environment:
+
+```
+export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/torch/lib:$LD_LIBRARY_PATH
+```
+
 ### **rocPRIM** (4.1.0)

 #### Added
@@ -881,17 +889,12 @@ As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastE

 ### **rocSOLVER** (3.31.0)

-#### Added
-
-* Hybrid computation support for existing routines: STEQR
-
 #### Optimized

 Improved the performance of:

-* BDSQR and downstream functions such as GESVD.
-* STEQR and downstream functions such as SYEV/HEEV.
-* LARFT and downstream functions such as GEQR2 and GEQRF.
+* LARF, LARFT, GEQR2, and downstream functions such as GEQRF.
+* STEDC and divide-and-conquer eigensolvers.

 ### **rocSPARSE** (4.1.0)

--- a/RELEASE.md
+++ b/RELEASE.md
@@ -673,6 +673,10 @@ For a historical overview of ROCm component updates, see the {doc}`ROCm consolid

 - Fixed output for `amd-smi xgmi -l --json`.  

+```{note}
+See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-710) for details, examples, and in-depth descriptions.
+```
+
 ### **Composable Kernel** (1.1.0)

 #### Upcoming changes
@@ -860,6 +864,22 @@ be fixed in a future ROCm release.

 Due to partial data corruption of Electrically Erasable Programmable Read-Only Memory (EEPROM) and limited error handling in the AMD GPU Driver (amdgpu), excessive log output might result when querying the reliability, availability, and serviceability (RAS) bad pages. This issue will be fixed in a future AMD GPU Driver (amdgpu) and ROCm release.

+### OpenBLAS runtime dependency for hipblaslt-test and hipblaslt-bench
+
+Running `hipblaslt-test` or `hipblaslt-bench` without installing the OpenBLAS development package results in the following error:
+```
+libopenblas.so.0: cannot open shared object file: No such file or directory
+```
+As a workaround, first install `libopenblas-dev` or `openblas-devel`, depending on the package manager used. The issue will be fixed in a future ROCm release. See [GitHub issue #5639](https://github.com/ROCm/ROCm/issues/5639).
+
+### Reduced precision in gemm_ex operations for rocBLAS and hipBLAS
+
+Some `gemm_ex` operations with `half` or `f32_r` data types might yield 16-bit precision results instead of the expected 32-bit precision when matrix dimensions are m=1 or n=1. The issue results from the optimization that enables `_ex` APIs to use lower-precision multiplications. It limits the high-precision matrix operations performed in PyTorch with rocBLAS and hipBLAS. The issue will be fixed in a future ROCm release. See [GitHub issue #5640](https://github.com/ROCm/ROCm/issues/5640).
+
+### RCCL profiler plugin failure with AllToAll operations
+
+The RCCL profiler plugin `librccl-profiler.so` might fail with a segmentation fault during `AllToAll` collective operations due to improperly assigned point-to-point task function pointers. This leads to invalid memory access and prevents profiling of `AllToAll` performance. Other operations, like `AllReduce`, are unaffected. It's recommended to avoid using the RCCL profiler plugin with `AllToAll` operations until the fix is available. This issue is resolved in the {fab}`github`[RCCL `develop` branch](https://github.com/ROCm/rccl/tree/develop) and will be part of a future ROCm release. See [GitHub issue #5653](https://github.com/ROCm/ROCm/issues/5653).
+
 ## ROCm resolved issues

 The following are previously known issues resolved in this release. For resolved issues related to
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-7.1.0"
+    <default revision="refs/tags/rocm-7.1.1"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
@@ -25,6 +25,7 @@
    <project groups="mathlibs" name="MIVisionX" />
    <project groups="mathlibs" name="ROCmValidationSuite" />
    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipSOLVER" />
    <project groups="mathlibs" name="hipTensor" />
    <project groups="mathlibs" name="hipfort" />
    <project groups="mathlibs" name="rccl" />
@@ -45,6 +46,7 @@
        rocprofiler rocr-runtime roctracer -->
    <project groups="mathlibs" name="rocm-systems" />
    <project groups="mathlibs" name="rocPyDecode" />
+    <project groups="mathlibs" name="rocSOLVER" />
    <project groups="mathlibs" name="rocSHMEM" />
    <project groups="mathlibs" name="rocWMMA" />
    <project groups="mathlibs" name="rocm-cmake" />
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -243,7 +243,7 @@ Expand for full historical view of:
   .. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
   .. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
   .. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
-   .. [#dgl_compat-past-60] DGL is supported only on ROCm 6.4.0.
+   .. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
   .. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
   .. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
   .. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
--- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst
@@ -39,13 +39,13 @@ Support overview
 Version support
 --------------------------------------------------------------------------------

-DGL is supported on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+DGL is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__, 
+`ROCm 6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__, and `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.

 Supported devices
 --------------------------------------------------------------------------------

- **Officially Supported**: AMD Instinct™ MI300X (through `hipBLASlt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`__)
- **Partially Supported**: AMD Instinct™ MI250X
+**Officially Supported**: AMD Instinct™ MI300X, MI250X

 .. _dgl-recommendations:

@@ -60,16 +60,35 @@ GAT, GCN, and GraphSage. Using these models, a variety of use cases are supporte
 - 1D (Temporal) and 2D (Image) Classification
 - Drug Discovery

-Multiple use cases of DGL have been tested and verified.
-However, a recommended example follows a drug discovery pipeline using the ``SE3Transformer``.
-Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_, 
-where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs. 
+For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.

-Coverage includes:
+* Although multiple use cases of DGL have been tested and verified, a few have been  
+  outlined in the `DGL in the Real World: Running GNNs on Real Use Cases 
+  <https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog 
+  post, which walks through four real-world graph neural network (GNN) workloads 
+  implemented with the Deep Graph Library on ROCm. It covers tasks ranging from 
+  heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph 
+  regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use 
+  case, the authors detail: the dataset and task, how DGL is used, and their experience 
+  porting to ROCm. It is shown that DGL codebases often run without modification, with 
+  seamless integration of graph operations, message passing, sampling, and convolution. 

- Single-GPU training/inference
- Multi-GPU training
+* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware 
+  <https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__ 
+  blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform, 
+  bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges 
+  the gap between dense tensor frameworks and the irregular nature of graph data through a 
+  graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and 
+  interoperability across frameworks like PyTorch and TensorFlow. AMD's ROCm integration
+  enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers 
+  and open-source repositories. This marks a major step in AMD's mission to advance open, 
+  scalable AI ecosystems beyond traditional architectures.

+You can pre-process datasets and begin training on AMD GPUs through:
+
+* Single-GPU training/inference
+* Multi-GPU training

 .. _dgl-docker-compat:

@@ -85,7 +104,7 @@ with ROCm backends on Docker Hub. The following Docker image tags and associated
 inventories represent the latest available DGL version from the official Docker Hub. 
 Click the |docker-icon| to view the image on Docker Hub.

-.. list-table:: DGL Docker image components
+.. list-table::
    :header-rows: 1
    :class: docker-image-compatibility

@@ -98,43 +117,83 @@ Click the |docker-icon| to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.8.0/images/sha256-943698ddf54c22a7bcad2e5b4ff467752e29e4ba6d0c926789ae7b242cbd92dd"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>

-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-b2ec286a035eb7d0a6aab069561914d21a3cac462281e9c024501ba5ccedfbf7"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>

-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu22.04_py3.10_pytorch_2.7.1/images/sha256-d27aee16df922ccf0bcd9107bfcb6d20d34235445d456c637e33ca6f19d11a51"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+
+      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
+      - 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm6.4.3_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-f3ba6a3c9ec9f6c1cde28449dc9780e0c4c16c4140f4b23f158565fbfd422d6b"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+
+      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
+
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__


    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>

-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
      - 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__


    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>

-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
-      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
+      - `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
      - 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
      
@@ -150,81 +209,102 @@ If you prefer to build it yourself, ensure the following dependencies are instal
    :header-rows: 1

    * - ROCm library
-      - ROCm 6.4.0 Version
+      - ROCm 7.0.0 Version
+      - ROCm 6.4.x Version
      - Purpose
    * - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
+      - 1.1.0
      - 1.1.0
      - Enables faster execution of core operations like matrix multiplication
        (GEMM), convolutions and transformations.
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
+      - 3.0.0
      - 2.4.0
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
+      - 1.0.0
      - 0.12.0
      - hipBLASLt is an extension of the hipBLAS library, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
    * - `hipCUB <https://github.com/ROCm/hipCUB>`_
+      - 4.0.0
      - 3.4.0
      - Provides a C++ template library for parallel algorithms for reduction,
        scan, sort and select.
    * - `hipFFT <https://github.com/ROCm/hipFFT>`_
+      - 1.0.20
      - 1.0.18
      - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
    * - `hipRAND <https://github.com/ROCm/hipRAND>`_
+      - 3.0.0
      - 2.12.0
      - Provides fast random number generation for GPUs.
    * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
+      - 3.0.0
      - 2.4.0
      - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
        singular value decompositions (SVD).
    * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
+      - 4.0.1
      - 3.2.0
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
    * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
+      - 0.2.4
      - 0.2.3
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
    * - `hipTensor <https://github.com/ROCm/hipTensor>`_
+      - 2.0.0
      - 1.5.0
      - Optimizes for high-performance tensor operations, such as contractions.
    * - `MIOpen <https://github.com/ROCm/MIOpen>`_
+      - 3.5.0
      - 3.4.0
      - Optimizes deep learning primitives such as convolutions, pooling,
        normalization, and activation functions.
    * - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
+      - 2.13.0
      - 2.12.0
      - Adds graph-level optimizations, ONNX models and mixed precision support
        and enable Ahead-of-Time (AOT) Compilation.
    * - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
+      - 3.3.0
      - 3.2.0
      - Optimizes acceleration for computer vision and AI workloads like
        preprocessing, augmentation, and inferencing.
    * - `rocAL <https://github.com/ROCm/rocAL>`_
-      - :version-ref:`rocAL rocm_version`
+      - 3.3.0
+      - 2.2.0
      - Accelerates the data pipeline by offloading intensive preprocessing and
        augmentation tasks. rocAL is part of MIVisionX.
    * - `RCCL <https://github.com/ROCm/rccl>`_
-      - 2.2.0
+      - 2.26.6
+      - 2.22.3
      - Optimizes for multi-GPU communication for operations like AllReduce and
        Broadcast.
    * - `rocDecode <https://github.com/ROCm/rocDecode>`_
+      - 1.0.0
      - 0.10.0
      - Provides hardware-accelerated data decoding capabilities, particularly
        for image, video, and other dataset formats.
    * - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
+      - 1.1.0
      - 0.8.0
      - Provides hardware-accelerated JPEG image decoding and encoding.
    * - `RPP <https://github.com/ROCm/RPP>`_
+      - 2.0.0
      - 1.9.10
      - Speeds up data augmentation, transformation, and other preprocessing steps.
    * - `rocThrust <https://github.com/ROCm/rocThrust>`_
+      - 4.0.0
      - 3.3.0
      - Provides a C++ template library for parallel algorithms like sorting,
        reduction, and scanning.
    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
+      - 2.0.0
      - 1.7.0
      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
        multiplication (GEMM) and accumulation operations with mixed precision
@@ -253,26 +333,29 @@ Instead of listing them all, support is grouped into the following categories to
 * DGL NN
 * DGL Optim
 * DGL Sparse
-
+* GraphBolt

 Unsupported features
 ================================================================================

-* GraphBolt
-* Partial TF32 Support (MI250X only)
+* TF32 support with PyTorch versions earlier than 2.7 (TF32 is supported only with PyTorch 2.7 and above)
 * Kineto/ROCTracer integration


 Unsupported functions
 ================================================================================

-* ``more_nnz``
+* ``bfs``
 * ``format``
 * ``multiprocess_sparse_adam_state_dict``
-* ``record_stream_ndarray``
 * ``half_spmm``
 * ``segment_mm`` 
 * ``gather_mm_idx_b``
-* ``pgexplainer``
 * ``sample_labors_prob``
 * ``sample_labors_noprob``
+* ``sparse_adam``
+
+Previous versions
+================================================================================
+See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases
+of the ``ROCm/dgl`` Docker image.
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -45,7 +45,7 @@ llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
 Supported devices
 --------------------------------------------------------------------------------

-**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
+**Officially Supported**: AMD Instinct™ MI325X, MI300X, MI210

 Use cases and recommendations
 ================================================================================
@@ -109,27 +109,27 @@ Click |docker-icon| to view the image on Docker Hub.

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_full/images/sha256-a2ecd635eaa65bb289a9041330128677f3ae88bee6fee0597424b17e38d4903c"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_full/images/sha256-a94f0c7a598cc6504ff9e8371c016d7a2f93e69bf54a36c870f9522567201f10"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
      - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_server/images/sha256-cb46b47df415addb5ceb6e6fdf0be70bf9d7f6863bbe6e10c2441ecb84246d52"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server/images/sha256-be175932c3c96e882dfbc7e20e0e834f58c89c2925f48b222837ee929dfc47ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
      - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_light/images/sha256-8f8536eec4b05c0ff1c022f9fc6c527ad1c89e6c1ca0906e4d39e4de73edbde9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_light/images/sha256-d8ba0c70603da502c879b1f8010b439c8e7fa9f6cbdac8bbbbbba97cb41ebc9e"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
      - 24.04

    * - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_full/images/sha256-f36de2a3b03ae53e81c85422cb3780368c9891e1ac7884b04403a921fe2ea45d"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_full/images/sha256-37582168984f25dce636cc7288298e06d94472ea35f65346b3541e6422b678ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
      - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_server/images/sha256-df15e8ab11a6837cd3736644fec1e047465d49e37d610ab0b79df000371327df"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_server/images/sha256-7e70578e6c3530c6591cc2c26da24a9ee68a20d318e12241de93c83224f83720"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
      - .. raw:: html

-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_light/images/sha256-4ea2d5bb7964f0ee3ea9b30ba7f343edd6ddfab1b1037669ca7eafad2e3c2bd7"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_light/images/sha256-9a5231acf88b4a229677bc2c636ea3fe78a7a80f558bd80910b919855de93ad5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
      - 22.04

--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -84,6 +84,8 @@ The table below summarizes information about ROCm-enabled deep learning framewor
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-wheels-package>`__
+
      - .. raw:: html

          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 
--- a/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
@@ -67,7 +67,7 @@ Quick start examples:
   export VLLM_ROCM_USE_AITER=1
   vllm serve MODEL_NAME

-   # Enable only AITER Triton Prefill-Decode (split) attention
+   # Enable AITER Fused MoE and enable Triton Prefill-Decode (split) attention
   export VLLM_ROCM_USE_AITER=1
   export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
   export VLLM_ROCM_USE_AITER_MHA=0
@@ -244,14 +244,17 @@ Most users won't need this, but you can override the defaults:
   * - AITER MHA (standard models)
     - ``VLLM_ROCM_USE_AITER=1`` (auto-selects for non-MLA models)

-   * - AITER Triton Prefill-Decode (split)
+   * - vLLM Triton Unified (default)
+     - ``VLLM_ROCM_USE_AITER=0`` (or unset)
+
+   * - Triton Prefill-Decode (split) without AITER
+     - | ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
+
+   * - Triton Prefill-Decode (split) along with AITER Fused-MoE
     - | ``VLLM_ROCM_USE_AITER=1``
       | ``VLLM_ROCM_USE_AITER_MHA=0``
       | ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``

-   * - vLLM Triton Unified (default)
-     - ``VLLM_ROCM_USE_AITER=0`` (or unset)
-
   * - AITER Unified Attention
     - | ``VLLM_ROCM_USE_AITER=1``
       | ``VLLM_ROCM_USE_AITER_MHA=0``
@@ -269,11 +272,11 @@ Most users won't need this, but you can override the defaults:
       --block-size 1 \
       --tensor-parallel-size 8

-   # Advanced: Use Prefill-Decode split (for short input cases)
+   # Advanced: Use Prefill-Decode split (for short input cases) with AITER Fused-MoE
   VLLM_ROCM_USE_AITER=1 \
   VLLM_ROCM_USE_AITER_MHA=0 \
   VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 \
-   vllm serve meta-llama/Llama-3.3-70B-Instruct
+   vllm serve meta-llama/Llama-4-Scout-17B-16E

 **Which backend should I choose?**

@@ -352,14 +355,14 @@ vLLM V1 on ROCm provides these attention implementations:

 3. **AITER Triton Prefill–Decode Attention** (hybrid, Instinct MI300X-optimized)

-   * Enable with ``VLLM_ROCM_USE_AITER=1``, ``VLLM_ROCM_USE_AITER_MHA=0``, and ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
+   * Enable with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
   * Uses separate kernels for prefill and decode phases:

     * **Prefill**: ``context_attention_fwd`` Triton kernel
     * **Primary decode**: ``torch.ops._rocm_C.paged_attention`` (custom ROCm kernel optimized for head sizes 64/128, block sizes 16/32, GQA 1–16, context ≤131k; sliding window not supported)
     * **Fallback decode**: ``kernel_paged_attention_2d`` Triton kernel when shapes don't meet primary decode requirements

-   * Usually better compared to unified Triton kernels (both vLLM and AITER variants)
+   * Usually better compared to unified Triton kernels
   * Performance vs AITER MHA varies: AITER MHA is typically faster overall, but Prefill-Decode split may win in short input scenarios
   * The custom paged attention decode kernel is controlled by ``VLLM_ROCM_CUSTOM_PAGED_ATTN`` (default **True**)

@@ -695,7 +698,9 @@ There are two strategies:
 vLLM engine arguments
 =====================

-Selected arguments that often help on ROCm. See `engine args docs <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_ for the full list.
+Selected arguments that often help on ROCm. See `Engine Arguments
+<https://docs.vllm.ai/en/stable/configuration/engine_args.html>`__ in the vLLM
+documentation for the full list.

 Configure --max-num-seqs
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
+++ b/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
@@ -22,7 +22,7 @@ See the `GitHub repository <https://github.com/vllm-project/vllm>`_ and `officia
 <https://docs.vllm.ai/>`_ for more information.

 For guidance on using vLLM with ROCm, refer to `Installation with ROCm
-<https://docs.vllm.ai/en/latest/getting_started/amd-installation.html>`_.
+<https://docs.vllm.ai/en/stable/getting_started/installation/gpu.html#amd-rocm>`__.

 vLLM installation
 -----------------
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,4 +1,4 @@
-rocm-docs-core==1.28.0
+rocm-docs-core==1.29.0
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -187,7 +187,7 @@ requests==2.32.5
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.28.0
+rocm-docs-core==1.29.0
    # via -r requirements.in
 rpds-py==0.28.0
    # via
--- a/tools/rocm-build/rocm-7.1.1.xml
+++ b/tools/rocm-build/rocm-7.1.1.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<manifest>
+    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
+    <default revision="refs/tags/rocm-7.1.1"
+     remote="rocm-org"
+     sync-c="true"
+     sync-j="4" />
+<!--list of projects for ROCm-->
+    <project name="ROCK-Kernel-Driver" />
+    <project name="amdsmi" />
+    <project name="rocm_bandwidth_test" />
+    <project name="rocm-examples" />
+<!--HIP Projects-->
+    <project name="HIPIFY" />
+<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
+    <project name="half" />
+    <project name="llvm-project" />
+    <project name="spirv-llvm-translator" />
+<!-- gdb projects -->
+    <project name="ROCdbgapi" />
+    <project name="ROCgdb" />
+    <project name="rocr_debug_agent" />
+<!-- ROCm Libraries -->
+    <project groups="mathlibs" name="AMDMIGraphX" />
+    <project groups="mathlibs" name="MIVisionX" />
+    <project groups="mathlibs" name="ROCmValidationSuite" />
+    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipSOLVER" />
+    <project groups="mathlibs" name="hipTensor" />
+    <project groups="mathlibs" name="hipfort" />
+    <project groups="mathlibs" name="rccl" />
+    <project groups="mathlibs" name="rocAL" />
+    <project groups="mathlibs" name="rocALUTION" />
+    <project groups="mathlibs" name="rocDecode" />
+    <project groups="mathlibs" name="rocJPEG" />
+    <!-- The following components have been migrated to rocm-libraries:
+        hipBLAS-common hipBLAS hipBLASLt hipCUB
+        hipFFT hipRAND hipSPARSE hipSPARSELt
+        MIOpen rocBLAS rocFFT rocPRIM rocRAND
+        rocSPARSE rocThrust Tensile -->
+    <project groups="mathlibs" name="rocm-libraries" />
+    <!-- The following components have been migrated to rocm-systems:
+        aqlprofile clr hip hip-tests hipother
+        rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute 
+        rocprofiler-register rocprofiler-sdk rocprofiler-systems 
+        rocprofiler rocr-runtime roctracer -->
+    <project groups="mathlibs" name="rocm-systems" />
+    <project groups="mathlibs" name="rocPyDecode" />
+    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
+    <project groups="mathlibs" name="rocWMMA" />
+    <project groups="mathlibs" name="rocm-cmake" />
+    <project groups="mathlibs" name="rpp" />
+    <project groups="mathlibs" name="TransferBench" />
+<!-- Projects for OpenMP-Extras -->
+    <project name="aomp" path="openmp-extras/aomp" />
+    <project name="aomp-extras" path="openmp-extras/aomp-extras" />
+    <project name="flang" path="openmp-extras/flang" />
+</manifest>