Compare commits

18 Commits

Author SHA1 Message Date
Istvan Kiss
eee2e0d4fb Update precision support page part I. 2025-07-31 14:21:17 +02:00
Joseph Macaranas
b2012cb0b9 External CI: rocm-libraries superbuild component yaml (#5125)
- Subset of the hipblaslt component yaml, deleting extra gpu targets and the testing component.
- Sparse checkout details removed.
- Basic build flags from top-level invocation added.
2025-07-30 17:50:46 -04:00
Daniel Su
45cf2b9a80 [Ex CI] rocprof-systems: add libsqlite3-dev (#5124)
Fixes rocprofiler-systems builds following ROCm/rocprofiler-systems@26ae543

Sample build:
https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=41257&view=results
2025-07-30 16:05:26 -04:00
Daniel Su
bee363995b [Ex CI] revert miopen-get-ck script change (#5123) 2025-07-30 12:03:51 -07:00
Daniel Su
3a031fad3a [Ex CI] disable MIOpen downstream jobs (#5122) 2025-07-30 13:12:11 -04:00
Daniel Su
46f6c4ff9a [Ex CI] enable MIOpen monorepo (#5117)
* init

* fix source dir

* miopen specify test build dir

* fix test build dir

* revert change

* fix test build again

* move to ultra temporarily

* miopen-get-ck, working dir

* exclude flaky test

* move back to high

* Add MIVisionX and AMDMIGraphX downstream jobs to MIOpen

* comment sparsecheckoutdir

* quote component names

* fix artifact name

* miopen ck script exit on fail

* add downstream checkout repos

* mivisionx, add aomp
2025-07-30 09:53:52 -07:00
Pratik Basyal
f632f2879f ROCm Software Stack image for 6.4.0 updated (#5112) 2025-07-28 14:51:19 -04:00
yugang-amd
cc5bc5a882 Add SGLang inference benchmark doc w/ initial support for DeepSeek-R1-Distill-Qwen-32B (#4870) 2025-07-25 12:42:40 -04:00
Daniel Su
2c9c3d0ba1 [Ex CI] switch hipBLAS/SPARSE pipeline IDs to monorepo (#5098) 2025-07-24 16:53:29 -04:00
Peter Park
14249f24d8 Use madengine instead of tools/run_models.py in docs (#5095) 2025-07-24 15:38:12 -04:00
Daniel Su
0e8045cca7 [Ex CI] enable hipBLAS monorepo (#5090) 2025-07-24 12:37:34 -04:00
Daniel Su
541fe92947 [Ex CI] update to 6.4.2 (#5087) 2025-07-23 14:10:40 -04:00
Daniel Su
628d5f8a19 [Ex CI] create Docker images for nightly builds (#5005) 2025-07-23 12:16:11 -04:00
Peter Park
984a91f008 Add DeepSeek Janus Pro 7B to PyTorch inference benchmark doc (#5071)
---------

Co-authored-by: yugang-amd <yugang.wang@amd.com>
2025-07-22 16:26:06 -04:00
amd-hsivasun
ae2cc6ab38 [EX CI] ROCR-Runtime: migrate from rocm-smi to amd-smi (#5088)
* Update ROCR-Runtime.yml

Migrate from rocmsmi to amdsmi

* Update ROCR-Runtime.yml

Removed libhwloc.so.5 install

* Update ROCR-Runtime.yml

Link to hwloc.so.5

* Update ROCR-Runtime.yml

Added link in the rocrtst step

* Update ROCR-Runtime.yml
2025-07-22 14:17:53 -04:00
Peter Park
15ee605d18 Fix branches for install docs in _toc.yml.in (#5083) 2025-07-22 11:03:40 -04:00
anisha-amd
ae54add299 Sphinx warning for ROCm fixed (#5077) (#5082)
* Sphinx warning for DGL fixed

* Update dgl-compatibility.rst

removed benchmark line and updated link

---------

Co-authored-by: Pratik Basyal <prbasyal@amd.com>
2025-07-22 10:51:15 -04:00
Peter Park
2269e9d25d Remove broken link to deprecated AMDGPU installer documentation (#5078)
* remove link to deprecated AMDGPU installation method

* add deep learning frameworks
2025-07-21 19:36:20 -04:00
29 changed files with 1572 additions and 271 deletions

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: AMDMIGraphX
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
# - name: sparseCheckoutDir
# type: string
# default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -93,7 +112,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: AMDMIGraphX_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -121,6 +144,8 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -146,12 +171,12 @@ jobs:
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: AMDMIGraphX_test_${{ job.target }}
dependsOn: AMDMIGraphX_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_ubuntu2204_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
@@ -183,6 +208,8 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: CMake@1
displayName: MIGraphXTest CMake Flags
inputs:
@@ -199,7 +226,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: AMDMIGraphX
componentName: ${{ parameters.componentName }}
testExecutable: make
testParameters: -j$(nproc) check
testPublishResults: false

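For context on the reworked test condition above: DISABLED_<TARGET>_TESTS is a comma-separated variable that is now matched against the componentName parameter instead of the pipeline definition name, so monorepo components sharing one pipeline can be disabled individually. A minimal sketch, assuming a hypothetical variable value:

# Hypothetical variable-group value, for illustration only:
#   DISABLED_GFX942_TESTS: 'AMDMIGraphX,MIOpen'
# With componentName: AMDMIGraphX, the guard
#   not(containsValue(split(variables['DISABLED_GFX942_TESTS'], ','), 'AMDMIGraphX'))
# evaluates to false, so AMDMIGraphX_test_ubuntu2204_gfx942 is skipped,
# while a component not on the list still runs its test job.
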
View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: MIOpen
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -74,10 +93,31 @@ parameters:
target: gfx942
- gfx90a:
target: gfx90a
- name: downstreamComponentMatrix
type: object
default:
- MIVisionX:
name: MIVisionX
checkoutRepo: mivisionx_repo
sparseCheckoutDir: ''
skipUnifiedBuild: 'false'
buildDependsOn:
- MIOpen_build
- AMDMIGraphX:
name: AMDMIGraphX
checkoutRepo: amdmigraphx_repo
sparseCheckoutDir: ''
skipUnifiedBuild: 'false'
buildDependsOn:
- MIOpen_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: MIOpen_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -95,6 +135,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/miopen-get-ck-build.yml
parameters:
gpuTarget: ${{ job.target }}
@@ -104,11 +145,13 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Build and install other dependencies
inputs:
targetType: inline
workingDirectory: $(Build.SourcesDirectory)
workingDirectory: $(Agent.BuildDirectory)/s
script: |
sed -i '/composable_kernel/d' requirements.txt
mkdir -p $(Agent.BuildDirectory)/miopen-deps
@@ -130,8 +173,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -143,9 +188,9 @@ jobs:
- miopen-deps
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: MIOpen_test_${{ job.target }}
- job: ${{ parameters.componentName }}_test_ubuntu2204_${{ job.target }}
timeoutInMinutes: 180
dependsOn: MIOpen_build_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -169,6 +214,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/miopen-get-ck-build.yml
parameters:
@@ -178,11 +224,13 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Build and install other dependencies
inputs:
targetType: inline
workingDirectory: $(Build.SourcesDirectory)
workingDirectory: $(Agent.BuildDirectory)/s
script: |
sed -i '/composable_kernel/d' requirements.txt
mkdir -p $(Agent.BuildDirectory)/miopen-deps
@@ -193,7 +241,7 @@ jobs:
displayName: 'MIOpen Test CMake Flags'
inputs:
cmakeArgs: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Build.SourcesDirectory)/bin;$(Agent.BuildDirectory)/miopen-deps
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/s/bin;$(Agent.BuildDirectory)/miopen-deps
-DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
@@ -203,19 +251,19 @@ jobs:
-DBUILD_DEV=OFF
-DMIOPEN_USE_MLIR=ON
-DMIOPEN_GPU_SYNC=OFF
..
$(Agent.BuildDirectory)/s
- task: Bash@3
displayName: 'MIOpen Test Build'
inputs:
targetType: inline
workingDirectory: build
script: |
cmake --build . --target tests -- -j$(nproc)
workingDirectory: $(Build.SourcesDirectory)/build
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: MIOpen
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "test_rnn_seq_api|GPU_Conv2dTuningAsm_FP32"'
componentName: ${{ parameters.componentName }}
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "test_rnn_seq_api|GPU_Conv2dTuningAsm_FP32|GPU_Conv2dTuningAsmBwdWrw_FP32"'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -224,3 +272,15 @@ jobs:
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- miopen-deps
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ component.checkoutRepo }}
# # sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: MIVisionX
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
# - name: sparseCheckoutDir
# type: string
# default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -60,6 +79,7 @@ parameters:
- name: rocmTestDependencies
type: object
default:
- aomp
- clr
- half
- hipBLAS-common
@@ -88,7 +108,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: MIVisionX_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -110,6 +134,8 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -131,12 +157,12 @@ jobs:
# gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: MIVisionX_test_${{ job.target }}
dependsOn: MIVisionX_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_ubuntu2204_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
@@ -161,6 +187,8 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Build MIVisionX tests
inputs:
@@ -174,7 +202,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: MIVisionX
componentName: ${{ parameters.componentName }}
testDir: 'mivisionx-tests'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:

View File

@@ -28,8 +28,8 @@ parameters:
- name: rocmTestDependencies
type: object
default:
- amdsmi
- llvm-project
- rocm_smi_lib
- rocprofiler-register
- name: jobMatrix
@@ -111,14 +111,6 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- task: Bash@3
displayName: Install libhwloc5
inputs:
targetType: 'inline'
script: |
wget http://ftp.us.debian.org/debian/pool/main/h/hwloc/libhwloc5_1.11.12-3_amd64.deb
wget http://ftp.us.debian.org/debian/pool/main/h/hwloc/libhwloc-dev_1.11.12-3_amd64.deb
sudo apt install -y --allow-downgrades ./libhwloc5_1.11.12-3_amd64.deb ./libhwloc-dev_1.11.12-3_amd64.deb
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
@@ -161,6 +153,10 @@ jobs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
script: |
echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p
if [ -e /opt/rh/gcc-toolset-14/enable ]; then
source /opt/rh/gcc-toolset-14/enable
fi

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipBLAS
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -69,10 +88,30 @@ parameters:
target: gfx942
- gfx90a:
target: gfx90a
# MIOpen depends on both rocRAND and hipBLAS
# for a unified build, hipBLAS will be the one to call MIOpen
- name: downstreamComponentMatrix
type: object
default:
- MIOpen:
name: MIOpen
sparseCheckoutDir: projects/miopen
skipUnifiedBuild: 'false'
buildDependsOn:
- hipBLAS_build
unifiedBuild:
downstreamAggregateNames: hipBLAS+rocRAND
buildDependsOn:
- hipBLAS_build
- rocRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipBLAS_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -88,6 +127,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -95,6 +135,8 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -109,9 +151,12 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -121,46 +166,67 @@ jobs:
installAOCL: true
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipBLAS_test_${{ job.target }}
dependsOn: hipBLAS_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipBLAS
testExecutable: $(Agent.BuildDirectory)/rocm/bin/hipblas-test
testParameters: '--yaml hipblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
testDir: '$(Agent.BuildDirectory)/rocm/bin'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_ubuntu2204_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testExecutable: $(Agent.BuildDirectory)/rocm/bin/hipblas-test
testParameters: '--yaml hipblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
testDir: '$(Agent.BuildDirectory)/rocm/bin'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
${{ if parameters.unifiedBuild }}:
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
${{ else }}:
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}

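To make the downstream-trigger block above concrete, here is a sketch of the template call it would expand to for the MIOpen entry in a unified build. The upstream aggregate value rocBLAS+rocPRIM is a hypothetical example, not taken from this diff:

# Sketch (hypothetical): expanded call for the MIOpen entry when
# unifiedBuild is true and hipBLAS was itself invoked with
# downstreamAggregateNames: rocBLAS+rocPRIM
- template: /.azuredevops/components/MIOpen.yml@pipelines_repo
  parameters:
    checkoutRepo: self                  # propagated from hipBLAS
    sparseCheckoutDir: projects/miopen
    triggerDownstreamJobs: true
    unifiedBuild: true
    buildDependsOn:
      - hipBLAS_build
      - rocRAND_build
    # parameters.downstreamAggregateNames + component.unifiedBuild.downstreamAggregateNames:
    downstreamAggregateNames: rocBLAS+rocPRIM+hipBLAS+rocRAND
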
View File

@@ -104,17 +104,17 @@ parameters:
- rocBLAS_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
- rocSOLVER:
name: rocSOLVER
sparseCheckoutDir: projects/rocsolver
skipUnifiedBuild: 'false'
buildDependsOn:
- rocBLAS_build
unifiedBuild:
downstreamAggregateNames: rocBLAS+rocPRIM
buildDependsOn:
- rocBLAS_build
- rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:

View File

@@ -91,12 +91,12 @@ parameters:
- rocPRIM_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'true'
# buildDependsOn:
# - rocPRIM_build
- rocSOLVER:
name: rocSOLVER
sparseCheckoutDir: projects/rocsolver
skipUnifiedBuild: 'true'
buildDependsOn:
- rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:

View File

@@ -79,6 +79,12 @@ parameters:
skipUnifiedBuild: 'false'
buildDependsOn:
- rocRAND_build
- MIOpen:
name: MIOpen
sparseCheckoutDir: projects/miopen
skipUnifiedBuild: 'true'
buildDependsOn:
- rocRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:

View File

@@ -83,6 +83,28 @@ parameters:
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- hipBLAS:
name: hipBLAS
sparseCheckoutDir: projects/hipblas
skipUnifiedBuild: 'false'
buildDependsOn:
- rocSOLVER_build
# hipSOLVER depends on both rocSOLVER and rocSPARSE
# for a unified build, rocSOLVER will be the one to call hipSOLVER
# - hipSOLVER:
# name: hipSOLVER
# sparseCheckoutDir: projects/hipsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocSOLVER_build
# unifiedBuild:
# downstreamAggregateNames: rocSOLVER+rocSPARSE
# buildDependsOn:
# - rocSOLVER_build
# - rocSPARSE_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -228,3 +250,19 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
${{ if parameters.unifiedBuild }}:
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
${{ else }}:
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}

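A note on the skipUnifiedBuild flag used throughout these matrices: the ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }} guard skips an entry only when both conditions hold, so a component with two upstreams is called exactly once in a unified build. A comment sketch of the cases:

# Guard: not(and(unifiedBuild, skipUnifiedBuild == 'true'))
#   unifiedBuild: false, skipUnifiedBuild: 'false'  -> triggered
#   unifiedBuild: false, skipUnifiedBuild: 'true'   -> triggered
#   unifiedBuild: true,  skipUnifiedBuild: 'false'  -> triggered
#   unifiedBuild: true,  skipUnifiedBuild: 'true'   -> skipped; the other
#       upstream (for example, rocBLAS rather than rocPRIM for rocSOLVER)
#       makes the call in the unified build
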
View File

@@ -0,0 +1,163 @@
parameters:
- name: componentName
type: string
default: rocm_libraries
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
type: boolean
default: false
- name: aptPackages
type: object
default:
- ccache
- gfortran
- git
- libdrm-dev
- libmsgpack-dev
- libnuma-dev
- ninja-build
- python3-pip
- python3-venv
- name: pipModules
type: object
default:
- joblib
- "packaging>=22.0"
- --upgrade
- name: rocmDependencies
type: object
default:
- aomp
- clr
- llvm-project
- rocminfo
- rocm-cmake
- rocm_smi_lib
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: rocmTestDependencies
type: object
default:
- aomp
- clr
- llvm-project
- rocminfo
- rocm_smi_lib
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: jobMatrix
type: object
default:
buildJobs:
- { pool: rocm-ci_ultra_build_pool, os: ubuntu2204, packageManager: apt, target: gfx942 }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: DAY_STRING
value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
pool: ${{ job.pool }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- script: |
mkdir -p $(CCACHE_DIR)
echo "##vso[task.prependpath]/usr/lib/ccache"
displayName: Update path for ccache
- task: Cache@2
displayName: Ccache caching
inputs:
key: rocm-libraries | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
path: $(CCACHE_DIR)
restoreKeys: |
rocm-libraries | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
rocm-libraries | ${{ job.os }} | ${{ job.target }}
rocm-libraries | ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DROCM_LIBRARIES_SUPERBUILD=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraCopyDirectories:
- deps

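As a usage illustration only (this trigger pipeline is not part of the change, and both the component file path and the repository resource name are assumed), the new superbuild component could be consumed like the other component templates:

# Hypothetical pipeline consuming the rocm-libraries superbuild component:
jobs:
- template: /.azuredevops/components/rocm_libraries.yml@pipelines_repo
  parameters:
    checkoutRepo: rocm_libraries_repo   # assumed repository resource name
    aggregatePipeline: false
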
View File

@@ -37,6 +37,7 @@ parameters:
- libpfm4-dev
- libtool
- libopenmpi-dev
- libsqlite3-dev
- m4
- ninja-build
- openmpi-bin

View File

@@ -3,21 +3,21 @@ parameters:
- name: jobList
type: object
default:
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- { os: ubuntu2204, packageManager: apt, target: gfx942, source: staging }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, source: staging }
- { os: ubuntu2204, packageManager: apt, target: gfx1201, source: staging }
- { os: ubuntu2204, packageManager: apt, target: gfx1100, source: staging }
- { os: ubuntu2204, packageManager: apt, target: gfx1030, source: staging }
- { os: ubuntu2404, packageManager: apt, target: gfx942, source: staging }
- { os: ubuntu2404, packageManager: apt, target: gfx90a, source: staging }
- { os: ubuntu2404, packageManager: apt, target: gfx1201, source: staging }
- { os: ubuntu2404, packageManager: apt, target: gfx1100, source: staging }
- { os: ubuntu2404, packageManager: apt, target: gfx1030, source: staging }
- { os: almalinux8, packageManager: dnf, target: gfx942, source: staging }
- { os: almalinux8, packageManager: dnf, target: gfx90a, source: staging }
- { os: almalinux8, packageManager: dnf, target: gfx1201, source: staging }
- { os: almalinux8, packageManager: dnf, target: gfx1100, source: staging }
- { os: almalinux8, packageManager: dnf, target: gfx1030, source: staging }
- name: rocmDependencies
type: object
default:
@@ -92,7 +92,8 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
- job: nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
timeoutInMinutes: 90
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -131,7 +132,7 @@ jobs:
includeRootFolder: false
archiveType: tar
tarCompression: gz
archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204_${{ job.target }}.tar.gz
archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_${{ job.os }}_${{ job.target }}.tar.gz
- script: du -sh $(Build.ArtifactStagingDirectory)
displayName: Compressed ROCm size
- task: PublishPipelineArtifact@1
@@ -144,5 +145,95 @@ jobs:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204_${{ job.target }}.tar.gz" >> pipelineArtifacts.txt
script: echo "$(Build.DefinitionName)_$(Build.BuildNumber)_${{ job.os }}_${{ job.target }}.tar.gz" >> pipelineArtifacts.txt
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.packageManager, 'apt') }}:
- task: Bash@3
displayName: Create Dockerfile
inputs:
workingDirectory: $(Agent.BuildDirectory)
targetType: inline
script: |
cat <<'EOF' > Dockerfile
${{ iif(eq(job.os, 'ubuntu2204'), 'FROM ubuntu:22.04', '') }}
${{ iif(eq(job.os, 'ubuntu2404'), 'FROM ubuntu:24.04', '') }}
WORKDIR /root
RUN mkdir rocm
RUN apt update \
&& apt upgrade -y \
&& apt install -y cmake curl git gcc g++ gpg lsb-release lsof ninja-build pkg-config python3 python3-pip wget zip libdrm-dev libelf-dev libgtest-dev libhsakmt-dev libhwloc-dev libnuma-dev libstdc++-12-dev libtbb-dev jq \
&& apt clean all
RUN PACKAGE_NAME=$(curl -s https://repo.radeon.com/rocm/apt/latest/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") \
&& wget -nv --retry-connrefused https://repo.radeon.com/rocm/apt/latest/pool/main/h/hsa-amd-aqlprofile/$PACKAGE_NAME \
&& mkdir hsa-amd-aqlprofile \
&& dpkg-deb -R $PACKAGE_NAME hsa-amd-aqlprofile \
&& cp -R hsa-amd-aqlprofile/opt/rocm-*/* rocm
RUN ARTIFACT_URL="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/builds/$(Build.BuildId)/artifacts?artifactName=nightly${{ job.os }}${{ job.target }}${{ job.source }}&api-version=7.1" \
&& DOWNLOAD_URL=$(curl -s $ARTIFACT_URL | jq ".resource.downloadUrl" | tr -d '"') \
&& wget -nv --retry-connrefused $DOWNLOAD_URL -O nightly.zip \
&& unzip nightly.zip \
&& tar -xf nightly${{ job.os }}${{ job.target }}${{ job.source }}/rocm-nightly*${{ job.os }}*${{ job.target }}*.tar.gz -C rocm
RUN echo /root/rocm/lib | tee /etc/ld.so.conf.d/rocm-ci.conf
RUN echo /root/rocm/llvm/lib | tee -a /etc/ld.so.conf.d/rocm-ci.conf
RUN echo /root/rocm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
RUN echo /root/rocm/llvm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
RUN ldconfig -v
ENV PATH="$PATH:/root/rocm/bin"
ENTRYPOINT ["/bin/bash"]
EOF
cat Dockerfile
- ${{ elseif eq(job.packageManager, 'dnf') }}:
- task: Bash@3
displayName: Create Dockerfile
inputs:
workingDirectory: $(Agent.BuildDirectory)
targetType: inline
script: |
cat <<'EOF' > Dockerfile
${{ iif(eq(job.os, 'almalinux8'), 'FROM almalinux:8', '') }}
WORKDIR /root
RUN mkdir rocm
RUN dnf install -y cmake curl git gcc gcc-c++ gnupg2 redhat-lsb-core lsof pkgconf python3 python3-pip wget zip libdrm-devel elfutils-libelf-devel numactl-devel libstdc++-devel tbb-devel jq \
&& dnf clean all
RUN PACKAGE_NAME=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) \
&& wget -nv --retry-connrefused https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$PACKAGE_NAME \
&& mkdir hsa-amd-aqlprofile \
&& dnf -y install rpm-build cpio \
&& rpm2cpio $PACKAGE_NAME | (cd hsa-amd-aqlprofile && cpio -idmv) \
&& cp -R hsa-amd-aqlprofile/opt/rocm-*/* rocm
RUN ARTIFACT_URL="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/builds/$(Build.BuildId)/artifacts?artifactName=nightly${{ job.os }}${{ job.target }}${{ job.source }}&api-version=7.1" \
&& DOWNLOAD_URL=$(curl -s $ARTIFACT_URL | jq ".resource.downloadUrl" | tr -d '"') \
&& wget -nv --retry-connrefused $DOWNLOAD_URL -O nightly.zip \
&& UNZIP_DISABLE_ZIPBOMB_DETECTION=TRUE unzip nightly.zip \
&& tar -xf nightly${{ job.os }}${{ job.target }}${{ job.source }}/rocm-nightly*${{ job.os }}*${{ job.target }}*.tar.gz -C rocm
RUN echo /root/rocm/lib | tee /etc/ld.so.conf.d/rocm-ci.conf
RUN echo /root/rocm/llvm/lib | tee -a /etc/ld.so.conf.d/rocm-ci.conf
RUN echo /root/rocm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
RUN echo /root/rocm/llvm/lib64 | tee -a /etc/ld.so.conf.d/rocm-ci.conf
RUN ldconfig -v
ENV PATH="$PATH:/root/rocm/bin"
ENTRYPOINT ["/bin/bash"]
EOF
cat Dockerfile
- task: Docker@2
displayName: Build and upload Docker image
inputs:
containerRegistry: ContainerService3
repository: 'nightly-${{ job.os }}-${{ job.target }}-${{ job.source }}'
Dockerfile: '$(Agent.BuildDirectory)/Dockerfile'
buildContext: '$(Agent.BuildDirectory)'
- task: Bash@3
displayName: '!! Docker Run Command !!'
inputs:
targetType: inline
script: echo "docker run -it --network=host --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined rocmexternalcicd.azurecr.io/nightly-${{ job.os }}-${{ job.target }}-${{ job.source }}:$(Build.BuildId)" | tr '[:upper:]' '[:lower:]'

View File

@@ -13,7 +13,7 @@ steps:
CC: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
inputs:
targetType: inline
workingDirectory: $(Build.SourcesDirectory)
workingDirectory: $(Agent.BuildDirectory)/s
script: |
AZ_API="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis"
GH_API="https://api.github.com/repos/ROCm"

View File

@@ -32,13 +32,13 @@ variables:
- name: GFX90A_TEST_POOL
value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.4.1
value: 6.4.2
- name: REPO_RADEON_VERSION
value: 6.4.1
value: 6.4.2
- name: NEXT_RELEASE_VERSION
value: 7.0.0
- name: LATEST_RELEASE_TAG
value: rocm-6.4.1
value: rocm-6.4.2
- name: DOCKER_SKIP_GFX
value: gfx90a
- name: AMDMIGRAPHX_PIPELINE_ID
@@ -68,7 +68,7 @@ variables:
- name: HIPBLAS_COMMON_PIPELINE_ID
value: 300
- name: HIPBLAS_PIPELINE_ID
value: 87
value: 317
- name: HIPBLASLT_PIPELINE_ID
value: 301
- name: HIPCUB_PIPELINE_ID
@@ -84,7 +84,7 @@ variables:
- name: HIPSOLVER_PIPELINE_ID
value: 84
- name: HIPSPARSE_PIPELINE_ID
value: 83
value: 315
- name: HIPSPARSELT_PIPELINE_ID
value: 309
- name: HIPTENSOR_PIPELINE_ID

View File

@@ -176,6 +176,7 @@ HBM
HCA
HGX
HIPCC
hipDataType
HIPExtension
HIPIFY
HIPification
@@ -408,6 +409,7 @@ SDMA
SDPA
SDRAM
SENDMSG
SGLang
SGPR
SGPRs
SHA
@@ -863,6 +865,7 @@ seealso
sendmsg
seqs
serializers
sglang
shader
sharding
sigmoid

View File

@@ -42,16 +42,16 @@ GAT, GCN and GraphSage. Using these we can support a variety of use-cases such a
- 1D (Temporal) and 2D (Image) Classification
- Drug Discovery
Refer to :doc:`ROCm DGL blog posts <https://rocm.blogs.amd.com/blog/tag/dgl.html>`
for examples and best practices to optimize your training workflows on AMD GPUs.
Multiple use cases of DGL have been tested and verified.
However, a recommended example follows a drug discovery pipeline using the ``SE3Transformer``.
Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs.
Coverage includes:
- Single-GPU training/inference
- Multi-GPU training
Benchmarking details are included in the :doc:`Benchmarks` section.
.. _dgl-docker-compat:
@@ -252,4 +252,4 @@ Unsupported functions
* ``gather_mm_idx_b``
* ``pgexplainer``
* ``sample_labors_prob``
* ``sample_labors_noprob``
* ``sample_labors_noprob``

View File

@@ -1,6 +1,6 @@
pytorch_inference_benchmark:
unified_docker:
latest: &rocm-pytorch-docker-latest
latest:
pull_tag: rocm/pytorch:latest
docker_hub_url:
rocm_version:
@@ -39,3 +39,11 @@ pytorch_inference_benchmark:
model_repo: Wan-AI/Wan2.1-T2V-14B
url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
precision: bfloat16
- group: Janus-Pro
tag: janus-pro
models:
- model: Janus Pro 7B
mad_tag: pyt_janus_pro_inference
model_repo: deepseek-ai/Janus-Pro-7B
url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
precision: bfloat16

View File

@@ -0,0 +1,17 @@
sglang_benchmark:
unified_docker:
latest:
pull_tag: lmsysorg/sglang:v0.4.5-rocm630
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
rocm_version: 6.3.0
sglang_version: 0.4.5 (0.4.5-rocm)
pytorch_version: 2.6.0a0+git8d4926e
model_groups:
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-R1-Distill-Qwen-32B
mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
precision: bfloat16

Binary image file changed (not shown): 1.2 MiB before, 1.1 MiB after.

View File

@@ -0,0 +1,25 @@
:orphan:
****************************************************
SGLang inference performance testing version history
****************************************************
This table lists previous versions of the ROCm SGLang inference performance
testing environment. For detailed information about available models for
benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
* - Docker image tag
- Components
- Resources
* - ``lmsysorg/sglang:v0.4.5-rocm630``
-
* ROCm 6.3.0
* SGLang 0.4.5
* PyTorch 2.6.0
-
* :doc:`Documentation <../sglang>`
* `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951>`__

View File

@@ -103,7 +103,7 @@ PyTorch inference performance testing
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
@@ -140,22 +140,27 @@ PyTorch inference performance testing
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in ``perf.csv``.
model are collected in ``perf_{{model.mad_tag}}.csv``.
{% if model.mad_tag != "pyt_janus_pro_inference" %}
.. note::
For improved performance, consider enabling TunableOp. By default,
``{{model.mad_tag}}`` runs with TunableOp disabled (see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable
it, edit the default run behavior in the ``tools/run_models.py``-- update the model's
run ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
it, include the ``--tunableop on`` argument in your run.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
Although this might increase the initial training time, it can result in a performance gain.
{% endif %}
{% endfor %}
{% endfor %}
@@ -163,8 +168,10 @@ PyTorch inference performance testing
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.

View File

@@ -0,0 +1,280 @@
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
:keywords: model, MAD, automation, dashboarding, validate
************************************
SGLang inference performance testing
************************************
.. _sglang-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
`SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
serving engine for large language models (LLMs) and vision models. The
ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
accelerators. It includes the following software components:
.. list-table::
:header-rows: 1
* - Software component
- Version
* - `ROCm <https://github.com/ROCm/ROCm>`__
- {{ unified_docker.rocm_version }}
* - `SGLang <https://docs.sglang.ai/index.html>`__
- {{ unified_docker.sglang_version }}
* - `PyTorch <https://github.com/pytorch/pytorch>`__
- {{ unified_docker.pytorch_version }}
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting benchmarking.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
{% set model_groups = data.sglang_benchmark.model_groups %}
Pull the Docker image
=====================
Download the `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose one of the following methods to benchmark inference performance with
`DeepSeek-R1-Distill-Qwen-32B <https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B>`__.
.. _sglang-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf_DeepSeek-R1-Distill-Qwen-32B.csv``.
Although the DeepSeek-R1-Distill-Qwen-32B model is preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required scripts
1. Run the SGLang benchmark script independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`__
as shown in the following snippet.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
docker run -it \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--shm-size 16G \
--security-opt seccomp=unconfined \
--security-opt apparmor=unconfined \
--cap-add=SYS_PTRACE \
-v $(pwd):/workspace \
--env HUGGINGFACE_HUB_CACHE=/workspace \
--name test \
{{ unified_docker.pull_tag }}
2. In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/sglang``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/sglang
3. To start the benchmark, use the following command with the appropriate options.
.. dropdown:: Benchmark options
:open:
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 8
- Number of GPUs
* - ``$datatype``
- ``bfloat16``
- Data type
* - ``$dataset``
- random
- Dataset
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
Command:
.. code-block:: shell
./sglang_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d $datatype [-a $dataset]
.. note::
If you encounter the following error, provide a Hugging Face token that has
been granted access to the gated model.
.. code-block:: shell-session
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. rubric:: Benchmarking examples
Here are some examples of running the benchmark with various options:
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./sglang_benchmark_report.sh \
-s latency \
-m {{model.model_repo}} \
-g 8 \
-d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./sglang_benchmark_report.sh \
-s throughput \
-m {{model.model_repo}} \
-g 8 \
-d {{model.precision}} \
-a random
Find the throughput report at ``./reports_{{model.precision}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/sgl-project/sglang/tree/main/benchmark/blog_v0_2>`__.
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`previous-versions/sglang-history` to find documentation for previous releases
of SGLang inference performance testing.

View File

@@ -202,7 +202,7 @@ system's configuration.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py \
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
@@ -226,12 +226,12 @@ system's configuration.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
To enable it, include the ``--tunableop on`` argument in your
run.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
Enabling TunableOp triggers a two-pass run -- a warm-up followed
by the performance-collection run.
{% endif %}
@@ -419,8 +419,10 @@ Further reading
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
View File
@@ -24,4 +24,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
- :doc:`PyTorch inference performance testing <benchmark-docker/pytorch-inference>`
- :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
- :doc:`Deploying your model <deploy-your-model>`
View File
@@ -24,12 +24,13 @@ If you're new to ROCm, refer to the :doc:`ROCm quick start install guide for L
If you're using a Radeon GPU for graphics-accelerated applications, refer to the
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install/install-overview>`:
You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
distribution's package manager. See the following documentation resources to get started:
* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
.. grid:: 1
@@ -59,6 +60,12 @@ images with the framework pre-installed.
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
Next steps
==========
View File
@@ -73,7 +73,11 @@ document are not validated.
.. code-block:: shell
python3 tools/run_models.py --tags pyt_mpt30b_training --keep-model-dir --live-output --clean-docker-cache
madengine run \
--tags pyt_mpt30b_training \
--keep-model-dir \
--live-output \
--clean-docker-cache
.. tip::
@@ -90,7 +94,7 @@ document are not validated.
For improved performance (training throughput), consider enabling TunableOp.
By default, ``pyt_mpt30b_training`` runs with TunableOp disabled. To enable it,
run ``tools/run_models.py`` with the ``--tunableop on`` argument or edit the
run ``madengine run`` with the ``--tunableop on`` argument or edit the
``models.json`` configuration before running training.
Although this might increase the initial training time, it can result in a performance gain.
@@ -172,4 +176,13 @@ Key performance metrics include:
Overall training loss. A decreasing trend indicates the model is learning effectively.
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
View File
@@ -142,7 +142,11 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with a name of the form
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
@@ -427,6 +431,17 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
View File
@@ -55,7 +55,7 @@ The floating-point types supported by ROCm are listed in the following table.
.. list-table::
:header-rows: 1
:widths: 15,15,70
:widths: 15,25,60
*
- Type name
@@ -63,18 +63,19 @@ The floating-point types supported by ROCm are listed in the following table.
- Description
*
- float8 (E4M3)
- ``__hip_fp8_e4m3_fnuz``
- An 8-bit floating-point number that mostly follows IEEE-754 conventions
and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_,
with expanded range and no infinity or signed zero. NaN is represented
as negative zero.
- | ``__hip_fp8_e4m3_fnuz``,
| ``__hip_fp8_e4m3``
- An 8-bit floating-point number with **S1E4M3** bit layout, as described on the :doc:`low precision floating point types <hip:reference/low_fp_types>` page.
The FNUZ variant has expanded range with no infinity or signed zero (NaN represented as negative zero),
while the OCP variant follows the Open Compute Project specification.
*
- float8 (E5M2)
- ``__hip_fp8_e5m2_fnuz``
- An 8-bit floating-point number mostly following IEEE-754 conventions and
**S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_,
with expanded range and no infinity or signed zero. NaN is represented
as negative zero.
- | ``__hip_fp8_e5m2_fnuz``,
| ``__hip_fp8_e5m2``
- An 8-bit floating-point number with **S1E5M2** bit layout, as described on the :doc:`low precision floating point types <hip:reference/low_fp_types>` page.
The FNUZ variant has expanded range with no infinity or signed zero (NaN represented as negative zero),
while the OCP variant follows the Open Compute Project specification.
*
- float16
- ``half``
@@ -107,9 +108,8 @@ The floating-point types supported by ROCm are listed in the following table.
* The float8 and tensorfloat32 types are internal types used in calculations
in Matrix Cores and can be stored in any type of the same size.
* The encodings for FP8 (E5M2) and FP8 (E4M3) that the
MI300 series natively supports differ from the FP8 (E5M2) and FP8 (E4M3)
encodings used in NVIDIA H100
* CDNA3 natively supports FP8 FNUZ (E4M3 and E5M2), which differs from the customized
FP8 format used in NVIDIA's H100
(`FP8 Formats for Deep Learning <https://arxiv.org/abs/2209.05433>`_).
* In some AMD documents and articles, float8 (E5M2) is referred to as bfloat8.
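To make the difference between the FNUZ and OCP variants concrete, the following host-side sketch round-trips a ``float`` through both 8-bit E4M3 encodings. It assumes the ``<hip/hip_fp8.h>`` header from recent ROCm releases, which defines ``float`` constructors and conversion operators for these types; treat it as an illustration, not a canonical usage pattern.

.. code-block:: cpp

   #include <hip/hip_fp8.h>
   #include <cstdio>

   int main() {
       // Store 0.3f in both 8-bit E4M3 encodings. Each keeps only the
       // nearest representable value, so the round-trip exposes the
       // quantization error of the format.
       __hip_fp8_e4m3_fnuz fnuz(0.3f); // FNUZ: no inf, no -0; NaN is -0
       __hip_fp8_e4m3 ocp(0.3f);       // OCP: Open Compute Project layout

       std::printf("fnuz round-trip: %f\n", static_cast<float>(fnuz));
       std::printf("ocp  round-trip: %f\n", static_cast<float>(ocp));
       return 0;
   }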
@@ -128,7 +128,7 @@ pages.
:header-rows: 1
*
- Icon
- Icon
- Definition
*
@@ -163,12 +163,137 @@ pages.
* Any type can be emulated by software, but this page does not cover such
cases.
Data type support by Hardware Architecture
Data type support by hardware architecture
==========================================
The MI200 series GPUs, which include MI210, MI250, and MI250X, are based on the
CDNA2 architecture. The MI300 series GPUs, consisting of MI300A, MI300X, and
MI325X, are based on the CDNA3 architecture.
AMD's GPU lineup spans multiple architecture generations:
* CDNA1 architecture: includes models such as MI100
* CDNA2 architecture: includes models such as MI210, MI250, and MI250X
* CDNA3 architecture: includes models such as MI300A, MI300X, and MI325X
* RDNA3 architecture: includes models such as RX 7900XT and RX 7900XTX
* RDNA4 architecture: includes models such as RX 9070 and RX 9070XT
HIP C++ type implementation support
-----------------------------------
The HIP C++ types available on different hardware platforms are listed in the
following table.
.. list-table::
:header-rows: 1
*
- HIP C++ Type
- CDNA1
- CDNA2
- CDNA3
- RDNA3
- RDNA4
*
- ``int8_t``, ``uint8_t``
-
-
-
-
-
*
- ``int16_t``, ``uint16_t``
-
-
-
-
-
*
- ``int32_t``, ``uint32_t``
-
-
-
-
-
*
- ``int64_t``, ``uint64_t``
-
-
-
-
-
*
- ``__hip_fp8_e4m3_fnuz``
-
-
-
-
-
*
- ``__hip_fp8_e5m2_fnuz``
-
-
-
-
-
*
- ``__hip_fp8_e4m3``
-
-
-
-
-
*
- ``__hip_fp8_e5m2``
-
-
-
-
-
*
- ``half``
-
-
-
-
-
*
- ``bfloat16``
-
-
-
-
-
*
- ``float``
-
-
-
-
-
*
- ``double``
-
-
-
-
-
.. note::
Library support for specific data types is contingent upon hardware support.
Even if a ROCm library indicates support for a particular data type, that type
will only be fully functional if the underlying hardware architecture (as shown
in the table above) also supports it. For example, fp8 types are only available
on architectures shown with a checkmark in the relevant rows.
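One way to honor this at runtime is to gate type-specific code paths on the architecture the HIP runtime reports. The sketch below uses the standard ``hipGetDeviceProperties`` API; the ``gfx94`` prefix check is an illustrative stand-in for CDNA3 detection, not an exhaustive test.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <cstdio>
   #include <cstring>

   int main() {
       hipDeviceProp_t prop;
       if (hipGetDeviceProperties(&prop, 0) != hipSuccess) return 1;
       // gcnArchName looks like "gfx942:sramecc+:xnack-"; gfx94x parts
       // are CDNA3, which natively supports the FP8 FNUZ types.
       bool cdna3 = std::strncmp(prop.gcnArchName, "gfx94", 5) == 0;
       std::printf("arch=%s, native FP8 FNUZ: %s\n",
                   prop.gcnArchName, cdna3 ? "yes" : "no");
       return 0;
   }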
Compute units support
---------------------
@@ -190,19 +315,33 @@ The following table lists data type support for compute units.
- int32
- int64
*
- MI100
- CDNA1
-
-
-
-
*
- MI200 series
- CDNA2
-
-
-
-
*
- MI300 series
- CDNA3
-
-
-
-
*
- RDNA3
-
-
-
-
*
- RDNA4
-
-
-
@@ -224,7 +363,7 @@ The following table lists data type support for compute units.
- float32
- float64
*
- MI100
- CDNA1
-
-
-
@@ -233,7 +372,7 @@ The following table lists data type support for compute units.
-
-
*
- MI200 series
- CDNA2
-
-
-
@@ -242,7 +381,27 @@ The following table lists data type support for compute units.
-
-
*
- MI300 series
- CDNA3
-
-
-
-
-
-
-
*
- RDNA3
-
-
-
-
-
-
-
*
- RDNA4
-
-
-
@@ -271,19 +430,33 @@ The following table lists data type support for AMD GPU matrix cores.
- int32
- int64
*
- MI100
- CDNA1
-
-
-
-
*
- MI200 series
- CDNA2
-
-
-
-
*
- MI300 series
- CDNA3
-
-
-
-
*
- RDNA3
-
-
-
-
*
- RDNA4
-
-
-
@@ -305,7 +478,7 @@ The following table lists data type support for AMD GPU matrix cores.
- float32
- float64
*
- MI100
- CDNA1
-
-
-
@@ -314,7 +487,7 @@ The following table lists data type support for AMD GPU matrix cores.
-
-
*
- MI200 series
- CDNA2
-
-
-
@@ -323,7 +496,7 @@ The following table lists data type support for AMD GPU matrix cores.
-
-
*
- MI300 series
- CDNA3
-
-
-
@@ -332,6 +505,26 @@ The following table lists data type support for AMD GPU matrix cores.
-
-
*
- RDNA3
-
-
-
-
-
-
-
*
- RDNA4
-
-
-
-
-
-
-
Atomic operations support
-------------------------
@@ -357,19 +550,33 @@ page.
- int32
- int64
*
- MI100
- CDNA1
-
-
-
-
*
- MI200 series
- CDNA2
-
-
-
-
*
- MI300 series
- CDNA3
-
-
-
-
*
- RDNA3
-
-
-
-
*
- RDNA4
-
-
-
@@ -391,7 +598,7 @@ page.
- float32
- float64
*
- MI100
- CDNA1
-
-
-
@@ -400,7 +607,7 @@ page.
-
-
*
- MI200 series
- CDNA2
-
-
-
@@ -409,7 +616,7 @@ page.
-
-
*
- MI300 series
- CDNA3
-
-
-
@@ -418,6 +625,26 @@ page.
-
-
*
- RDNA3
-
-
-
-
-
-
-
*
- RDNA4
-
-
-
-
-
-
-
.. note::
You can emulate atomic operations using software for cases that are not
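A common software fallback is a compare-and-swap loop. The device-side sketch below emulates a double-precision atomic add with ``atomicCAS``, the familiar CUDA-style pattern that HIP also provides; it illustrates the emulation idea rather than serving as a drop-in for every unsupported case.

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Emulate an atomic double add with a compare-and-swap loop, the usual
   // fallback when the hardware tables above show no native support.
   __device__ double atomicAddCAS(double* address, double val) {
       unsigned long long* p = reinterpret_cast<unsigned long long*>(address);
       unsigned long long old = *p, assumed;
       do {
           assumed = old;
           double next = __longlong_as_double(assumed) + val;
           old = atomicCAS(p, assumed, __double_as_longlong(next));
       } while (assumed != old);          // retry if another thread intervened
       return __longlong_as_double(old);  // previous value, like atomicAdd
   }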
@@ -452,36 +679,98 @@ detailed description.
- int16
- int32
- int64
*
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
- :doc:`Composable Kernel <composable_kernel:reference/Composable_Kernel_supported_scalar_types>`
- ✅/✅
- ❌/❌
- ✅/✅
- ❌/❌
- ❌/❌
*
- :doc:`rocRAND <rocrand:api-reference/data-type-support>`
- NA/✅
- NA/✅
- NA/✅
- NA/✅
*
- :doc:`hipRAND <hiprand:api-reference/data-type-support>`
- NA/✅
- NA/✅
- NA/✅
- NA/✅
*
- :doc:`rocPRIM <rocprim:reference/data-type-support>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- :doc:`hipCUB <hipcub:api-reference/data-type-support>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- :doc:`hipRAND <hiprand:api-reference/data-type-support>`
- NA/✅
- NA/✅
- NA/✅
- NA/✅
*
- :doc:`hipSOLVER <hipsolver:reference/precision>`
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
- ✅/✅
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`hipTensor <hiptensor:api-reference/api-reference>`
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`MIGraphX <amdmigraphx:reference/cpp>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- :doc:`MIOpen <miopen:reference/datatypes>`
- ⚠️/⚠️
- ❌/❌
- ⚠️/⚠️
- ❌/❌
*
- :doc:`RCCL <rccl:api-reference/library-specification>`
- ✅/✅
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`rocFFT <rocfft:reference/api>`
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`rocPRIM <rocprim:reference/data-type-support>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- :doc:`rocRAND <rocrand:api-reference/data-type-support>`
- NA/✅
- NA/✅
- NA/✅
- NA/✅
*
- :doc:`rocSOLVER <rocsolver:reference/precision>`
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`rocThrust <rocthrust:data-type-support>`
- ✅/✅
@@ -489,6 +778,14 @@ detailed description.
- ✅/✅
- ✅/✅
*
- :doc:`rocWMMA <rocwmma:api-reference/api-reference-guide>`
- ✅/✅
- ❌/❌
- ❌/✅
- ❌/❌
.. tab-item:: Floating-point types
:sync: floating-point-type
@@ -504,42 +801,17 @@ detailed description.
- tensorfloat32
- float32
- float64
*
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
- ❌/❌
- ❌/❌
- :doc:`Composable Kernel <composable_kernel:reference/Composable_Kernel_supported_scalar_types>`
- ✅/✅
- ✅/✅
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`rocRAND <rocrand:api-reference/data-type-support>`
- NA/❌
- NA/❌
- NA/✅
- NA/❌
- NA/❌
- NA/✅
- NA/✅
*
- :doc:`hipRAND <hiprand:api-reference/data-type-support>`
- NA/❌
- NA/❌
- NA/✅
- NA/❌
- NA/❌
- NA/✅
- NA/✅
*
- :doc:`rocPRIM <rocprim:reference/data-type-support>`
- ❌/❌
- ❌/❌
- ✅/✅
- ✅/✅
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`hipCUB <hipcub:api-reference/data-type-support>`
- ❌/❌
@@ -549,6 +821,117 @@ detailed description.
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`hipRAND <hiprand:api-reference/data-type-support>`
- NA/❌
- NA/❌
- NA/✅
- NA/❌
- NA/❌
- NA/✅
- NA/✅
*
- :doc:`hipSOLVER <hipsolver:reference/precision>`
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
- ❌/❌
- ❌/❌
- ❌/❌
*
- :doc:`hipTensor <hiptensor:api-reference/api-reference>`
- ❌/❌
- ❌/❌
- ✅/✅
- ✅/✅
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`MIGraphX <amdmigraphx:reference/cpp>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- :doc:`MIOpen <miopen:reference/datatypes>`
- ⚠️/⚠️
- ⚠️/⚠️
- ✅/✅
- ⚠️/⚠️
- ❌/❌
- ✅/✅
- ⚠️/⚠️
*
- :doc:`RCCL <rccl:api-reference/library-specification>`
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`rocFFT <rocfft:reference/api>`
- ❌/❌
- ❌/❌
- ✅/✅
- ❌/❌
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`rocPRIM <rocprim:reference/data-type-support>`
- ❌/❌
- ❌/❌
- ✅/✅
- ✅/✅
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`rocRAND <rocrand:api-reference/data-type-support>`
- NA/❌
- NA/❌
- NA/✅
- NA/❌
- NA/❌
- NA/✅
- NA/✅
*
- :doc:`rocSOLVER <rocsolver:reference/precision>`
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
- ❌/❌
- ✅/✅
- ✅/✅
*
- :doc:`rocThrust <rocthrust:data-type-support>`
- ❌/❌
@@ -559,62 +942,123 @@ detailed description.
- ✅/✅
- ✅/✅
*
- :doc:`rocWMMA <rocwmma:api-reference/api-reference-guide>`
- ✅/❌
- ✅/❌
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
.. note::
As random number generation libraries, rocRAND and hipRAND only specify output
data types for the random values they generate, with no need for input data
types.
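For example, in the host-side sketch below the output type is fixed entirely by the generate call (``hiprandGenerateUniform`` produces ``float`` values). It assumes the standard hipRAND host API; error handling is omitted for brevity.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <hiprand/hiprand.h>
   #include <cstdio>

   int main() {
       const size_t n = 8;
       float* d_out = nullptr;
       if (hipMalloc(&d_out, n * sizeof(float)) != hipSuccess) return 1;

       hiprandGenerator_t gen;
       hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT);
       hiprandGenerateUniform(gen, d_out, n); // output type: float

       float h[n];
       hipMemcpy(h, d_out, n * sizeof(float), hipMemcpyDeviceToHost);
       for (size_t i = 0; i < n; ++i) std::printf("%f ", h[i]);
       std::printf("\n");

       hiprandDestroyGenerator(gen);
       hipFree(d_out);
       return 0;
   }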
Libraries internal calculations type support
--------------------------------------------
hipDataType enumeration
-----------------------
The following tables list ROCm library support for specific internal data types.
Refer to the corresponding library data type support page for a detailed
description.
The ``hipDataType`` enumeration defines data precision types and is primarily
used when a data reference carries no type information of its own, such as a
``void*`` pointer. It is mainly used in BLAS libraries.
The HIP type equivalents of the ``hipDataType`` enumeration are listed in the
following table with descriptions and values.
.. tab-set::
.. list-table::
:header-rows: 1
:widths: 25,25,10,40
.. tab-item:: Integral types
:sync: integral-type
*
- hipDataType
- HIP type
- Value
- Description
.. list-table::
:header-rows: 1
*
- ``HIP_R_8I``
- ``int8_t``
- 3
- 8-bit real signed integer.
*
- Library internal data type name
- int8
- int16
- int32
- int64
*
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
-
-
-
-
*
- ``HIP_R_8U``
- ``uint8_t``
- 8
- 8-bit real unsigned integer.
*
- ``HIP_R_16I``
- ``int16_t``
- 20
- 16-bit real signed integer.
.. tab-item:: Floating-point types
:sync: floating-point-type
*
- ``HIP_R_16U``
- ``uint16_t``
- 22
- 16-bit real unsigned integer.
.. list-table::
:header-rows: 1
*
- ``HIP_R_32I``
- ``int32_t``
- 10
- 32-bit real signed integer.
*
- Library internal data type name
- float8 (E4M3)
- float8 (E5M2)
- float16
- bfloat16
- tensorfloat32
- float32
- float64
*
- :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
-
-
-
-
-
-
-
*
- ``HIP_R_32U``
- ``uint32_t``
- 12
- 32-bit real unsigned integer.
*
- ``HIP_R_32F``
- ``float``
- 0
- 32-bit real single precision floating-point.
*
- ``HIP_R_64F``
- ``double``
- 1
- 64-bit real double precision floating-point.
*
- ``HIP_R_16F``
- ``half``
- 2
- 16-bit real half precision floating-point.
*
- ``HIP_R_16BF``
- ``bfloat16``
- 14
- 16-bit real bfloat16 precision floating-point.
*
- ``HIP_R_8F_E4M3``
- ``__hip_fp8_e4m3``
- 28
- 8-bit real float8 precision floating-point (OCP version).
*
- ``HIP_R_8F_E5M2``
- ``__hip_fp8_e5m2``
- 29
- 8-bit real bfloat8 precision floating-point (OCP version).
*
- ``HIP_R_8F_E4M3_FNUZ``
- ``__hip_fp8_e4m3_fnuz``
- 1000
- 8-bit real float8 precision floating-point (FNUZ version).
*
- ``HIP_R_8F_E5M2_FNUZ``
- ``__hip_fp8_e5m2_fnuz``
- 1001
- 8-bit real bfloat8 precision floating-point (FNUZ version).
The full list of ``hipDataType`` enumeration values is available in `library_types.h <https://github.com/ROCm/hip/blob/amd-staging/include/hip/library_types.h>`_.
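As a small illustration of how these enumerators travel alongside untyped buffers, the helper below maps a subset of the values above to element sizes. The function name and the subset chosen are illustrative only, not part of any library API.

.. code-block:: cpp

   #include <hip/library_types.h>
   #include <cstddef>

   // Byte width of a hipDataType element, for the subset shown above.
   constexpr std::size_t elementSize(hipDataType t) {
       switch (t) {
           case HIP_R_8I:  case HIP_R_8U:
           case HIP_R_8F_E4M3: case HIP_R_8F_E5M2:
           case HIP_R_8F_E4M3_FNUZ: case HIP_R_8F_E5M2_FNUZ: return 1;
           case HIP_R_16I: case HIP_R_16U:
           case HIP_R_16F: case HIP_R_16BF:                  return 2;
           case HIP_R_32I: case HIP_R_32U: case HIP_R_32F:   return 4;
           case HIP_R_64F:                                   return 8;
           default:                                          return 0;
       }
   }

A routine receiving ``(void* buf, hipDataType t, size_t n)`` can then compute its byte count as ``n * elementSize(t)``.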

View File

@@ -19,9 +19,9 @@ subtrees:
- caption: Install
entries:
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/
title: ROCm on Linux
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/latest/
title: HIP SDK on Windows
- url: https://rocm.docs.amd.com/projects/radeon/en/latest/index.html
title: ROCm on Radeon GPUs
@@ -82,6 +82,8 @@ subtrees:
title: vLLM inference performance testing
- file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
title: PyTorch inference performance testing
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
title: SGLang inference performance testing
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
title: Deploy your model